Stemmer Catalan

Per compilar amb Snowball. Snowball
Lista de prefixas en catalan
Lista de prefixas en occitan
Modifying a Lucene Snowball Stemmer
Snowballstem

  1. routines ( // forward declarations of every routine defined further down
  2. cleaning mark_regions
  3. R1 R2
  4. attached_pronoun
  5. standard_suffix
  6. verb_suffix
  7. residual_suffix
  8. )
  9.  
  10. externals ( stem ) // stem is the only routine declared as external
  11.  
  12. integers ( p1 p2 ) // region marks: set in mark_regions, tested by R1 and R2
  13.  
  14. groupings ( v ) // v is the vowel grouping defined after the stringdefs
  15.  
  16. stringescapes {} // inside a string, {name} refers to a stringdef below
  17.  
  18. /* special characters (in ISO Latin I) */
  19.  
  20. stringdef a' hex 'E1' // a-acute
  21. stringdef a` hex 'E0' // a-grave
  22. stringdef c, hex 'E7' // c-cedilla
  23. stringdef e' hex 'E9' // e-acute
  24. stringdef e` hex 'E8' // e-grave
  25. stringdef i' hex 'ED' // i-acute
  26. stringdef i` hex 'EC' // i-grave
  27. stringdef i" hex 'EF' // i-diaeresis
  28. stringdef o' hex 'F3' // o-acute
  29. stringdef o` hex 'F2' // o-grave
  30. stringdef u' hex 'FA' // u-acute
  31. stringdef u" hex 'FC' // u-diaeresis
  32. stringdef - hex '2D' // hyphen (original note: for geminates); written {-} in attached_pronoun
  33. stringdef . hex 'B7' // middle dot (original note: for the geminated l); cleaning rewrites it to '.'
  34.  
  35. define v 'aeiou{a'}{a`}{e'}{e`}{i'}{i"}{o'}{o`}{u'}{u"}' // plain and accented vowels; {i`} is not part of the set
  36.  
  37. define mark_regions as ( // computes the marks p1 and p2 that R1 and R2 test against
  38.  
  39. $p1 = limit
  40. $p2 = limit // defaults: both marks stay at the limit when the scans below fail
  41.  
  42. do ( // do: a failed scan does not make the routine fail; marks already set are kept
  43. gopast v gopast non-v setmark p1 // p1 = position just past the first non-vowel that follows a vowel
  44. gopast v gopast non-v setmark p2 // p2 = the same pattern applied again, continuing from p1
  45. )
  46. )
  47.  
  48. define cleaning as repeat ( // walks the word, replacing listed accented letters; repeat stops when the among fails
  49. [substring] among( // [ ] delimit the matched character so that <- replaces exactly that slice
  50. '{a'}' (<- 'a') // accented vowels are replaced by the plain vowel
  51. '{a`}' (<- 'a')
  52. '{e'}' (<- 'e')
  53. '{e`}' (<- 'e')
  54. '{i'}' (<- 'i')
  55. '{i`}' (<- 'i')
  56. '{o'}' (<- 'o')
  57. '{o`}' (<- 'o')
  58. '{u'}' (<- 'u')
  59. '{u"}' (<- 'u')
  60. '{i"}' (<- 'i')
  61. '{.}' (<- '.') // middle dot (hex B7) becomes an ordinary full stop
  62. '' (next) // none of the listed characters here: step over one character
  63. ) //or next
  64. )
  65.  
  66. backwardmode (
  67.  
  68. define R1 as $p1 <= cursor // succeeds when the cursor is at or after mark p1
  69. define R2 as $p2 <= cursor // succeeds when the cursor is at or after mark p2
  70.  
  71. define attached_pronoun as ( // deletes one of the endings listed below when it passes the R1 test
  72. [substring] among ( // the longest listed ending is taken; {'} is the apostrophe, {-} the hyphen stringdef
  73. '{'}s' '{'}hi' '{'}ho' '{'}l' '{'}ls'
  74. '{-}ls' '{-}la' '{-}les' '{-}li'
  75. 'vos' 'se' 'nos' '{-}nos' '{-}us' 'us'
  76. '{'}n' '{'}ns' '{-}n' '{-}ns'
  77. '{'}m' '{-}me' '{-}m'
  78. '{-}te' '{'}t'
  79. 'li' 'lo' 'los'
  80. 'me' 'sela' 'selo' 'selas' 'selos' 'le'
  81. 'la' 'las' 'les' 'ens' 'ho' 'hi'
  82. (R1 delete) // single action shared by every string above; the routine fails if R1 does not hold
  83. )
  84. )
  85.  
  86. define standard_suffix as ( // handles the longest ending found in the table below; fails if none matches or the region test fails
  87. [substring] among( // each run of strings shares the parenthesised action that follows it
  88. 'ar' 'atge' 'formes' 'icte' 'ictes'
  89. 'ell' 'ells' 'ella' '{e'}s' '{e`}s' 'esc' 'essa' 'et' 'ets' 'eta'
  90. 'eres' 'eries' 'ers' 'ina' 'ines' 'able' 'ls'
  91. 'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius'
  92. 'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste'
  93. 'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis'
  94. '{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{c,}a' 'nces' '{o'}' 'dor' 'all'
  95. 'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu'
  96. '{o'}s' 'osa' 'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar'
  97. 'itar' 'ables' 'adors' 'idores' 'idors'
  98. 'adora' 'aci{o'}' 'doras' 'dur' 'dures' 'alleng{u"}es'
  99. 'ant' 'ants' 'ancia' 'ancies' 'at{o`}ria' 'at{o`}ries' 'tori' 'toris'
  100. 'ats' 'ions' 'ota' 'isam' 'ors' 'ora' 'ores' 'isament'
  101. 'bilitat' 'bilitats' 'ivitat' 'ivitats' 'ari' 'aris' 'ionisme' 'ionista' 'ionistes'
  102. 'ialista' 'ialistes' 'ialisme' 'ialismes' 'ud' 'uts' 'uds' 'encia' 'encies' '{e`}ncia' '{e`}ncies'
  103. '{i"}tat' '{i"}tats' 'atiu' 'atius' 'atives' 'ativa' 'ativitat' 'ativitats' 'ible' 'ibles'
  104. 'assa' 'asses' 'assos'
  105. 'ent' 'ents'
  106. '{i'}ssim' '{i'}ssima' '{i'}ssims' '{i'}ssimes' '{i`}ssem' '{i`}sseu' '{i`}ssin'
  107. 'ims' 'ima' 'imes'
  108. 'isme' 'ista' 'ismes' 'istes'
  109. 'inia' 'inies' '{i'}nia' '{i'}nies' 'ita' 'ites' 'triu' 'trius' // accented pair mirrors inia / inies, so singular and plural reduce alike
  110. 'oses' 'osos' 'ient' 'otes' 'ots'
  111. (R1 delete) // every ending listed above: removed when the R1 test holds
  112. 'acions' 'ada' 'ades'
  113. (R2 delete) // these three are removed only when the R2 test holds
  114. 'log{i'}a' 'log{i'}es' 'logia' 'logies' 'logi' 'logis' 'l{o'}gica' 'l{o'}gics' 'l{o'}giques'
  115. (R2 <- 'log') // the whole ending is replaced by log
  116. 'ic' 'ica' 'ics' 'iques'
  117. (R2 <- 'ic') // the ending is normalised to ic
  118. 'qu{i'}ssim' 'qu{i'}ssims' 'qu{i'}ssimes' 'qu{i'}ssima'
  119. (R1 <- 'c') // the ending, including its leading qu, is replaced by a single c
  120. )
  121. )
  122.  
  123. define verb_suffix as ( // deletes the longest ending found in the table below; fails if none matches or the region test fails
  124. [substring] among( // each run of strings shares the parenthesised action that follows it
  125. 'ador' 'adora' 'adors' 'adores' 're' 'ie'
  126. 'ent' 'ents' 'udes' 'ar{a`}' 'eren'
  127. 'ar{a'}' 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
  128. 'aria' 'arian' 'arien' 'aries' 'ar{a`}s'
  129. 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ara'
  130. 'ar{e'}' 'ar{e'}s'
  131. 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
  132. 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
  133. 'er{e'}' 'er' 'erau' 'erass'
  134. 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
  135. 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
  136. 'ir{e'}' '{i'}rem' '{i'}reu' '{i'}eu'
  137. 'ia' 'ies' '{i'}em' '{i`}eu' 'ien'
  138. 'at' 'ut' 'uda' 'ava' 'aves' 'avem' '{a'}vem' '{a`}vem' '{a`}veu' '{a'}veu' 'aven' 'au' 'ats'
  139. 'asseu' 'esseu' 'eresseu' '{a`}sseu' '{a`}ssem' '{a`}ssim' '{a`}ssiu'
  140. 'essen' 'esses' 'assen' 'asses' 'assim' 'assiu'
  141. '{e'}ssen' '{e'}sseu' '{e'}ssim' '{e'}ssiu' '{e'}ssem'
  142. '{i'}' 'ares' '{a`}rem' '{a`}reu' '{a`}ren'
  143. 'ar{i'}em' 'ar{i'}eu'
  144. 'areu' 'aren' 'ant' '{i"}m' '{i"}u'
  145. '{e'}s' '{i"}en' 'en' 'es' 'em' 'am' 'ams' '{i"}a' '{i"}es'
  146. 'dre' 'eix' 'eixer' 'tzar' 'eixes' 'ides' '{i"}des' 'it' '{i"}t' '{i"}da'
  147. 'aba' 'ada' 'ades' 'ida' '{i'}a' 'iera' 'ad' 'ed' 'its'
  148. 'id' 'ids' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
  149. 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
  150. 'ido' 'iendo' 'i{o'}' 'ar' 'ir' 'as'
  151. 'ieu' 'ii' 'io' 'i{a`}'
  152. 'ess' 'essin' 'essis' 'ass' 'assin' 'assis' 'essim' '{e`}ssim' '{e`}ssiu'
  153. 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
  154. 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
  155. 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
  156. 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' 'ques'
  157. '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
  158. 'ira' 'iran' 'irem' 'iren' 'ires' 'ireu' 'iria' 'irien'
  159. 'iries' 'ir{a`}' 'ir{a`}s' 'ir{e`}' 'ir{i`}em' 'ir{i`}eu'
  160. 'isquen' 'iguem' 'igueu' 'esqui' 'esquin' 'esquis' 'eixi' 'eixin' 'eixis'
  161. 'eixen' 'eixo' 'isin' 'isis' 'esques' 'sis' 'sin'
  162. 'int' 'ir{i'}em' 'ir{i'}eu' 'isc' 'atges' 'esca' 'esquen'
  163. 'issen' 'isses' 'issin' 'issis' 'isca' 'issiu' 'issim'
  164. '{i"}sc' '{i"}sca' '{i"}ssin' '{i'}ssiu' '{i'}ssim' '{i"}ssis' '{i"}guem' '{i"}gueu'
  165. '{i"}ra' '{i"}ren' '{i"}res'
  166. '{i"}squen' '{i"}sques' '{i"}ssen' '{i"}sses' '{i"}xo' '{i"}xen' '{i"}xes' '{i"}x'
  167. 'ixo' 'ixen' 'ixes' 'ix' 'ixa' 'inin' 'inis' 'ini' 'ineu' 'itza' 'itzi' 'itzeu' 'itzis'
  168. 'itzo' 'itz' 'itz{a`}' 'arem' 'in' '{a`}s' 'i{i"}' 'i{i"}n' 'i{i"}s'
  169. (R1 delete) // every ending listed above: removed when the R1 test holds
  170. 'ando'
  171. (R2 delete) // ando is removed only when the R2 test holds
  172. )
  173. )
  174.  
  175. define residual_suffix as ( // last pass: handles one short ending from the table below
  176. [substring] among( // the longest listed ending is taken
  177. 'os' 'a' 'o' '{a'}' '{a`}' '{i'}' '{o'}' 'e' '{e'}' 'eu' 'iu'
  178. 'is' 'i' 'ir' 's' '{i`}' 'itz' '{i"}' '{i"}n' '{i"}s' 'it'
  179. (R1 delete) // every ending listed above: removed when the R1 test holds
  180. 'iqu'
  181. (R1 <- 'ic') // iqu is rewritten to ic when the R1 test holds
  182. )
  183. )
  184. )
  185.  
  186. define stem as ( // entry point: mark regions, strip endings working backwards, then replace accented letters
  187. do mark_regions // forward pass that sets p1 and p2
  188. backwards ( // the suffix routines below run from the end of the word
  189. do attached_pronoun
  190. do ( standard_suffix or // verb_suffix is tried only when standard_suffix fails
  191. verb_suffix
  192. )
  193. do residual_suffix
  194. )
  195. do cleaning // runs last, in forward mode, after the suffix tables have matched on accented forms
  196. )
  197. /*
  198. First works 2010/07/19
  199. First Grammatical Reviews: http://ca.wikipedia.org/wiki/Gram%C3%A0tica_catalana
  200. Suffix list: http://wapedia.mobi/ca/Llista_de_sufixos
  201. Irregular Verbs: http://ca.wikipedia.org/wiki/Flexi%C3%B3_verbal_del_catal%C3%A0
  202. */
user/domenge/stemmer/stemmercatalan.txt · Dernière modification: 2018/11/05 09:53 par domenge
CC Attribution-Share Alike 4.0 International
Driven by DokuWiki Recent changes RSS feed Valid CSS Valid XHTML 1.0