CL
7 years ago
1152 changed files with 284133 additions and 1 deletion
@@ -0,0 +1,19 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

com.fr.third.org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory
com.fr.third.org.apache.lucene.analysis.charfilter.MappingCharFilterFactory
com.fr.third.org.apache.lucene.analysis.fa.PersianCharFilterFactory
com.fr.third.org.apache.lucene.analysis.pattern.PatternReplaceCharFilterFactory
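The block above has the shape of a JDK service-provider registration file: one implementation class per line, with `#` lines as comments. The file names themselves are not shown in this view, but for Lucene 4.x these lists live under META-INF/services/, keyed by the factory base class. A minimal probe of such a registration — assuming the repackaged jar is on the classpath, that the base class is the repackaged analysis.util.CharFilterFactory, and that the factories keep public no-arg constructors (Lucene itself loads them through its own SPI loader rather than java.util.ServiceLoader):

```java
import java.util.ServiceLoader;

// Hypothetical probe; the service type is an assumption inferred from the
// package prefix in the list above (Lucene 4.x keeps CharFilterFactory in
// the analysis.util package).
import com.fr.third.org.apache.lucene.analysis.util.CharFilterFactory;

public class CharFilterSpiProbe {
    public static void main(String[] args) {
        // ServiceLoader reads the META-INF/services registration and
        // instantiates each listed factory via its no-arg constructor.
        for (CharFilterFactory factory : ServiceLoader.load(CharFilterFactory.class)) {
            System.out.println(factory.getClass().getName());
        }
    }
}
```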
@@ -0,0 +1,92 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

com.fr.third.org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.ar.ArabicStemFilterFactory
com.fr.third.org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.br.BrazilianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
com.fr.third.org.apache.lucene.analysis.cjk.CJKWidthFilterFactory
com.fr.third.org.apache.lucene.analysis.cn.ChineseFilterFactory
com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory
com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory
com.fr.third.org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilterFactory
com.fr.third.org.apache.lucene.analysis.core.StopFilterFactory
com.fr.third.org.apache.lucene.analysis.core.TypeTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.cz.CzechStemFilterFactory
com.fr.third.org.apache.lucene.analysis.de.GermanLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.de.GermanMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.de.GermanNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.de.GermanStemFilterFactory
com.fr.third.org.apache.lucene.analysis.el.GreekLowerCaseFilterFactory
com.fr.third.org.apache.lucene.analysis.el.GreekStemFilterFactory
com.fr.third.org.apache.lucene.analysis.en.EnglishMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.en.EnglishPossessiveFilterFactory
com.fr.third.org.apache.lucene.analysis.en.KStemFilterFactory
com.fr.third.org.apache.lucene.analysis.en.PorterStemFilterFactory
com.fr.third.org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.fr.FrenchMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.ga.IrishLowerCaseFilterFactory
com.fr.third.org.apache.lucene.analysis.gl.GalicianMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.gl.GalicianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.hi.HindiStemFilterFactory
com.fr.third.org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory
com.fr.third.org.apache.lucene.analysis.id.IndonesianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.in.IndicNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.it.ItalianLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.lv.LatvianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
com.fr.third.org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory
com.fr.third.org.apache.lucene.analysis.ngram.NGramFilterFactory
com.fr.third.org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
com.fr.third.org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.payloads.NumericPayloadTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.position.PositionFilterFactory
com.fr.third.org.apache.lucene.analysis.pt.PortugueseLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.pt.PortugueseMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.pt.PortugueseStemFilterFactory
com.fr.third.org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
com.fr.third.org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.shingle.ShingleFilterFactory
com.fr.third.org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
com.fr.third.org.apache.lucene.analysis.standard.ClassicFilterFactory
com.fr.third.org.apache.lucene.analysis.standard.StandardFilterFactory
com.fr.third.org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.synonym.SynonymFilterFactory
com.fr.third.org.apache.lucene.analysis.th.ThaiWordFilterFactory
com.fr.third.org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory
com.fr.third.org.apache.lucene.analysis.util.ElisionFilterFactory
com.fr.third.org.apache.lucene.collation.CollationKeyFilterFactory
@@ -0,0 +1,31 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

com.fr.third.org.apache.lucene.analysis.ar.ArabicLetterTokenizerFactory
com.fr.third.org.apache.lucene.analysis.cjk.CJKTokenizerFactory
com.fr.third.org.apache.lucene.analysis.cn.ChineseTokenizerFactory
com.fr.third.org.apache.lucene.analysis.core.KeywordTokenizerFactory
com.fr.third.org.apache.lucene.analysis.core.LetterTokenizerFactory
com.fr.third.org.apache.lucene.analysis.core.LowerCaseTokenizerFactory
com.fr.third.org.apache.lucene.analysis.core.WhitespaceTokenizerFactory
com.fr.third.org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory
com.fr.third.org.apache.lucene.analysis.ngram.NGramTokenizerFactory
com.fr.third.org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory
com.fr.third.org.apache.lucene.analysis.pattern.PatternTokenizerFactory
com.fr.third.org.apache.lucene.analysis.ru.RussianLetterTokenizerFactory
com.fr.third.org.apache.lucene.analysis.standard.ClassicTokenizerFactory
com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizerFactory
com.fr.third.org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory
com.fr.third.org.apache.lucene.analysis.wikipedia.WikipediaTokenizerFactory
@@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

com.fr.third.org.apache.lucene.codecs.lucene40.Lucene40Codec
com.fr.third.org.apache.lucene.codecs.lucene3x.Lucene3xCodec
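In Lucene 4.x, codecs are resolved at read time by their short SPI name rather than by class name, and this registration is what makes that lookup work. A minimal sketch, assuming the repackaged classes mirror Lucene 4.x's Codec API (the package is inferred from the entries above):

```java
// Minimal sketch; Codec.forName resolves a codec by its SPI name by walking
// the registrations in this services file.
import com.fr.third.org.apache.lucene.codecs.Codec;

public class CodecProbe {
    public static void main(String[] args) {
        // "Lucene40" is the codec's SPI name, not its class name.
        Codec codec = Codec.forName("Lucene40");
        System.out.println(codec.getName()); // prints: Lucene40
    }
}
```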
@@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

com.fr.third.org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat
@@ -0,0 +1,125 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
# Cleaned on October 11, 2009 (not normalized, so use before normalization)
# This means that when modifying this list, you might need to add some
# redundant entries, for example containing forms with both أ and ا
من
ومن
منها
منه
في
وفي
فيها
فيه
و
ف
ثم
او
أو
ب
بها
به
ا
أ
اى
اي
أي
أى
لا
ولا
الا
ألا
إلا
لكن
ما
وما
كما
فما
عن
مع
اذا
إذا
ان
أن
إن
انها
أنها
إنها
انه
أنه
إنه
بان
بأن
فان
فأن
وان
وأن
وإن
التى
التي
الذى
الذي
الذين
الى
الي
إلى
إلي
على
عليها
عليه
اما
أما
إما
ايضا
أيضا
كل
وكل
لم
ولم
لن
ولن
هى
هي
هو
وهى
وهي
وهو
فهى
فهي
فهو
انت
أنت
لك
لها
له
هذه
هذا
تلك
ذلك
هناك
كانت
كان
يكون
تكون
وكانت
وكان
غير
بعض
قد
نحو
بين
بينما
منذ
ضمن
حيث
الان
الآن
خلال
بعد
قبل
حتى
عند
عندما
لدى
جميع
@@ -0,0 +1,193 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
а
аз
ако
ала
бе
без
беше
би
бил
била
били
било
близо
бъдат
бъде
бяха
в
вас
ваш
ваша
вероятно
вече
взема
ви
вие
винаги
все
всеки
всички
всичко
всяка
във
въпреки
върху
г
ги
главно
го
д
да
дали
до
докато
докога
дори
досега
доста
е
едва
един
ето
за
зад
заедно
заради
засега
затова
защо
защото
и
из
или
им
има
имат
иска
й
каза
как
каква
какво
както
какъв
като
кога
когато
което
които
кой
който
колко
която
къде
където
към
ли
м
ме
между
мен
ми
мнозина
мога
могат
може
моля
момента
му
н
на
над
назад
най
направи
напред
например
нас
не
него
нея
ни
ние
никой
нито
но
някои
някой
няма
обаче
около
освен
особено
от
отгоре
отново
още
пак
по
повече
повечето
под
поне
поради
после
почти
прави
пред
преди
през
при
пък
първо
с
са
само
се
сега
си
скоро
след
сме
според
сред
срещу
сте
съм
със
също
т
тази
така
такива
такъв
там
твой
те
тези
ти
тн
то
това
тогава
този
той
толкова
точно
трябва
тук
тъй
тя
тях
у
харесва
ч
че
често
чрез
ще
щом
я
@@ -0,0 +1,128 @@
a
ainda
alem
ambas
ambos
antes
ao
aonde
aos
apos
aquele
aqueles
as
assim
com
como
contra
contudo
cuja
cujas
cujo
cujos
da
das
de
dela
dele
deles
demais
depois
desde
desta
deste
dispoe
dispoem
diversa
diversas
diversos
do
dos
durante
e
ela
elas
ele
eles
em
entao
entre
essa
essas
esse
esses
esta
estas
este
estes
ha
isso
isto
logo
mais
mas
mediante
menos
mesma
mesmas
mesmo
mesmos
na
nas
nao
nas
nem
nesse
neste
nos
o
os
ou
outra
outras
outro
outros
pelas
pelas
pelo
pelos
perante
pois
por
porque
portanto
proprio
propios
quais
qual
qualquer
quando
quanto
que
quem
quer
se
seja
sem
sendo
seu
seus
sob
sobre
sua
suas
tal
tambem
teu
teus
toda
todas
todo
todos
tua
tuas
tudo
um
uma
umas
uns
@@ -0,0 +1,220 @@
# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed)
a
abans
ací
ah
així
això
al
als
aleshores
algun
alguna
algunes
alguns
alhora
allà
allí
allò
altra
altre
altres
amb
ambdós
ambdues
apa
aquell
aquella
aquelles
aquells
aquest
aquesta
aquestes
aquests
aquí
baix
cada
cadascú
cadascuna
cadascunes
cadascuns
com
contra
d'un
d'una
d'unes
d'uns
dalt
de
del
dels
des
després
dins
dintre
donat
doncs
durant
e
eh
el
els
em
en
encara
ens
entre
érem
eren
éreu
es
és
esta
està
estàvem
estaven
estàveu
esteu
et
etc
ets
fins
fora
gairebé
ha
han
has
havia
he
hem
heu
hi
ho
i
igual
iguals
ja
l'hi
la
les
li
li'n
llavors
m'he
ma
mal
malgrat
mateix
mateixa
mateixes
mateixos
me
mentre
més
meu
meus
meva
meves
molt
molta
moltes
molts
mon
mons
n'he
n'hi
ne
ni
no
nogensmenys
només
nosaltres
nostra
nostre
nostres
o
oh
oi
on
pas
pel
pels
per
però
perquè
poc
poca
pocs
poques
potser
propi
qual
quals
quan
quant
que
què
quelcom
qui
quin
quina
quines
quins
s'ha
s'han
sa
semblant
semblants
ses
seu
seus
seva
seva
seves
si
sobre
sobretot
sóc
solament
sols
son
són
sons
sota
sou
t'ha
t'han
t'he
ta
tal
també
tampoc
tan
tant
tanta
tantes
teu
teus
teva
teves
ton
tons
tot
tota
totes
tots
un
una
unes
uns
us
va
vaig
vam
van
vas
veu
vosaltres
vostra
vostre
vostres
@@ -0,0 +1,35 @@
a
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
s
such
t
that
the
their
then
there
these
they
this
to
was
will
with
www
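The resource files above all follow one convention: one stopword per line, with optional `#` comment lines. A plain-JDK sketch of reading such a list (the file name below is illustrative; the actual resource paths are not shown in this view):

```java
import java.io.BufferedReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.Set;

public class StopwordList {
    public static void main(String[] args) throws Exception {
        Set<String> stopwords = new HashSet<>();
        // Hypothetical path; each resource holds one token per line,
        // with '#' marking comment lines.
        try (BufferedReader r = Files.newBufferedReader(
                Path.of("stopwords_en.txt"), StandardCharsets.UTF_8)) {
            String line;
            while ((line = r.readLine()) != null) {
                line = line.trim();
                if (!line.isEmpty() && !line.startsWith("#")) {
                    stopwords.add(line);
                }
            }
        }
        System.out.println(stopwords.contains("the")); // true for the English list
    }
}
```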
@@ -0,0 +1,67 @@
<?xml version="1.0" encoding="US-ASCII"?>
<!--
  Copyright 1999-2004 The Apache Software Foundation

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->

<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
                            classes, exceptions?, patterns)>

<!-- Hyphen character to be used in the exception list as a shortcut for
     <hyphen pre-break="-"/>. Defaults to '-'.
-->
<!ELEMENT hyphen-char EMPTY>
<!ATTLIST hyphen-char value CDATA #REQUIRED>

<!-- Default minimum length in characters of hyphenated word fragments
     before and after the line break. For some languages this is not
     only for aesthetic purposes; wrong hyphens may be generated if this
     is not accounted for.
-->
<!ELEMENT hyphen-min EMPTY>
<!ATTLIST hyphen-min before CDATA #REQUIRED>
<!ATTLIST hyphen-min after CDATA #REQUIRED>

<!-- Character equivalent classes: space separated list of character groups, all
     characters in a group are to be treated equivalent as far as
     the hyphenation algorithm is concerned. The first character in a group
     is the group's equivalent character. Patterns should only contain
     first characters. It also defines word characters, i.e. a word that
     contains characters not present in any of the classes is not hyphenated.
-->
<!ELEMENT classes (#PCDATA)>

<!-- Hyphenation exceptions: space separated list of hyphenated words.
     A hyphen is indicated by the hyphen tag, but you can use the
     hyphen-char defined previously as a shortcut. This is for cases
     where the algorithm finds wrong hyphens or you want
     to provide your own hyphenation for some words.
-->
<!ELEMENT exceptions (#PCDATA|hyphen)* >

<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
     characters as described before; between any two word characters a digit
     in the range 0 to 9 may be specified. The absence of a digit is equivalent
     to zero. The '.' character is reserved to indicate the beginning or ending
     of words. -->
<!ELEMENT patterns (#PCDATA)>

<!-- A "full hyphen" equivalent to TeX's \discretionary
     with pre-break, post-break and no-break attributes.
     To be used in the exceptions list; the hyphen character is not
     automatically added -->
<!ELEMENT hyphen EMPTY>
<!ATTLIST hyphen pre CDATA #IMPLIED>
<!ATTLIST hyphen no CDATA #IMPLIED>
<!ATTLIST hyphen post CDATA #IMPLIED>
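For reference, a minimal document conforming to this DTD; the classes, exception, and patterns below are invented placeholders, not a real pattern set:

```xml
<?xml version="1.0" encoding="US-ASCII"?>
<hyphenation-info>
  <hyphen-char value="-"/>
  <hyphen-min before="2" after="2"/>
  <!-- first character of each group is the group's equivalent form -->
  <classes>
    aA bB cC
  </classes>
  <!-- 'ta-ble' uses the hyphen-char shortcut instead of an explicit <hyphen/> -->
  <exceptions>
    ta-ble
  </exceptions>
  <!-- digits grade break points; '.' anchors a word boundary -->
  <patterns>
    .ab4 a1bc 2b1c
  </patterns>
</hyphenation-info>
```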
@@ -0,0 +1,172 @@
a
s
k
o
i
u
v
z
dnes
cz
tímto
budeš
budem
byli
jseš
můj
svým
ta
tomto
tohle
tuto
tyto
jej
zda
proč
máte
tato
kam
tohoto
kdo
kteří
mi
nám
tom
tomuto
mít
nic
proto
kterou
byla
toho
protože
asi
ho
naši
napište
re
což
tím
takže
svých
její
svými
jste
aj
tu
tedy
teto
bylo
kde
ke
pravé
ji
nad
nejsou
či
pod
téma
mezi
přes
ty
pak
vám
ani
když
však
neg
jsem
tento
článku
články
aby
jsme
před
pta
jejich
byl
ještě
až
bez
také
pouze
první
vaše
která
nás
nový
tipy
pokud
může
strana
jeho
své
jiné
zprávy
nové
není
vás
jen
podle
zde
už
být
více
bude
již
než
který
by
které
co
nebo
ten
tak
má
při
od
po
jsou
jak
další
ale
si
se
ve
to
jako
za
zpět
ze
do
pro
je
na
atd
atp
jakmile
přičemž
já
on
ona
ono
oni
ony
my
vy
jí
ji
mě
mne
jemu
tomu
těm
těmu
němu
němuž
jehož
jíž
jelikož
jež
jakož
načež
@@ -0,0 +1,78 @@
# Lucene Greek Stopwords list
# Note: by default this file is used after GreekLowerCaseFilter,
# so when modifying this file use 'σ' instead of 'ς'
ο
η
το
οι
τα
του
τησ
των
τον
την
και
κι
κ
ειμαι
εισαι
ειναι
ειμαστε
ειστε
στο
στον
στη
στην
μα
αλλα
απο
για
προσ
με
σε
ωσ
παρα
αντι
κατα
μετα
θα
να
δε
δεν
μη
μην
επι
ενω
εαν
αν
τοτε
που
πωσ
ποιοσ
ποια
ποιο
ποιοι
ποιεσ
ποιων
ποιουσ
αυτοσ
αυτη
αυτο
αυτοι
αυτων
αυτουσ
αυτεσ
αυτα
εκεινοσ
εκεινη
εκεινο
εκεινοι
εκεινεσ
εκεινα
εκεινων
εκεινουσ
οπωσ
ομωσ
ισωσ
οσο
οτι
@@ -0,0 +1,99 @@
# example set of basque stopwords
al
anitz
arabera
asko
baina
bat
batean
batek
bati
batzuei
batzuek
batzuetan
batzuk
bera
beraiek
berau
berauek
bere
berori
beroriek
beste
bezala
da
dago
dira
ditu
du
dute
edo
egin
ere
eta
eurak
ez
gainera
gu
gutxi
guzti
haiei
haiek
haietan
hainbeste
hala
han
handik
hango
hara
hari
hark
hartan
hau
hauei
hauek
hauetan
hemen
hemendik
hemengo
hi
hona
honek
honela
honetan
honi
hor
hori
horiei
horiek
horietan
horko
horra
horrek
horrela
horretan
horri
hortik
hura
izan
ni
noiz
nola
non
nondik
nongo
nor
nora
ze
zein
zen
zenbait
zenbat
zer
zergatik
ziren
zituen
zu
zuek
zuen
zuten
@@ -0,0 +1,313 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
# Note: by default this file is used after normalization, so when adding entries
# to this file, use the arabic 'ي' instead of 'ی'
انان
نداشته
سراسر
خياه
ايشان
وي
تاكنون
بيشتري
دوم
پس
ناشي
وگو
يا
داشتند
سپس
هنگام
هرگز
پنج
نشان
امسال
ديگر
گروهي
شدند
چطور
ده
و
دو
نخستين
ولي
چرا
چه
وسط
ه
كدام
قابل
يك
رفت
هفت
همچنين
در
هزار
بله
بلي
شايد
اما
شناسي
گرفته
دهد
داشته
دانست
داشتن
خواهيم
ميليارد
وقتيكه
امد
خواهد
جز
اورده
شده
بلكه
خدمات
شدن
برخي
نبود
بسياري
جلوگيري
حق
كردند
نوعي
بعري
نكرده
نظير
نبايد
بوده
بودن
داد
اورد
هست
جايي
شود
دنبال
داده
بايد
سابق
هيچ
همان
انجا
كمتر
كجاست
گردد
كسي
تر
مردم
تان
دادن
بودند
سري
جدا
ندارند
مگر
يكديگر
دارد
دهند
بنابراين
هنگامي
سمت
جا
انچه
خود
دادند
زياد
دارند
اثر
بدون
بهترين
بيشتر
البته
به
براساس
بيرون
كرد
بعضي
گرفت
توي
اي
ميليون
او
جريان
تول
بر
مانند
برابر
باشيم
مدتي
گويند
اكنون
تا
تنها
جديد
چند
بي
نشده
كردن
كردم
گويد
كرده
كنيم
نمي
نزد
روي
قصد
فقط
بالاي
ديگران
اين
ديروز
توسط
سوم
ايم
دانند
سوي
استفاده
شما
كنار
داريم
ساخته
طور
امده
رفته
نخست
بيست
نزديك
طي
كنيد
از
انها
تمامي
داشت
يكي
طريق
اش
چيست
روب
نمايد
گفت
چندين
چيزي
تواند
ام
ايا
با
ان
ايد
ترين
اينكه
ديگري
راه
هايي
بروز
همچنان
پاعين
كس
حدود
مختلف
مقابل
چيز
گيرد
ندارد
ضد
همچون
سازي
شان
مورد
باره
مرسي
خويش
برخوردار
چون
خارج
شش
هنوز
تحت
ضمن
هستيم
گفته
فكر
بسيار
پيش
براي
روزهاي
انكه
نخواهد
بالا
كل
وقتي
كي
چنين
كه
گيري
نيست
است
كجا
كند
نيز
يابد
بندي
حتي
توانند
عقب
خواست
كنند
بين
تمام
همه
ما
باشند
مثل
شد
اري
باشد
اره
طبق
بعد
اگر
صورت
غير
جاي
بيش
ريزي
اند
زيرا
چگونه
بار
لطفا
مي
درباره
من
ديده
همين
گذاري
برداري
علت
گذاشته
هم
فوق
نه
ها
شوند
اباد
همواره
هر
اول
خواهند
چهار
نام
امروز
مان
هاي
قبل
كنم
سعي
تازه
را
هستند
زير
جلوي
عنوان
بود
@@ -0,0 +1,110 @@

a
ach
ag
agus
an
aon
ar
arna
as
b'
ba
beirt
bhúr
caoga
ceathair
ceathrar
chomh
chtó
chuig
chun
cois
céad
cúig
cúigear
d'
daichead
dar
de
deich
deichniúr
den
dhá
do
don
dtí
dá
dár
dó
faoi
faoin
faoina
faoinár
fara
fiche
gach
gan
go
gur
haon
hocht
i
iad
idir
in
ina
ins
inár
is
le
leis
lena
lenár
m'
mar
mo
mé
na
nach
naoi
naonúr
ná
ní
níor
nó
nócha
ocht
ochtar
os
roimh
sa
seacht
seachtar
seachtó
seasca
seisear
siad
sibh
sinn
sna
sé
sí
tar
thar
thú
triúr
trí
trína
trínár
tríocha
tú
um
ár
é
éis
í
ó
ón
óna
ónár
@@ -0,0 +1,647 @@
# Steps file for the RSLP stemmer.

# Step 1: Plural Reduction
{ "Plural", 3, 1, {"s"},
# bons -> bon
{"ns",1,"n",{"luns","furatapóns","furatapons"}},
# xamós -> xamón
{"ós",3,"ón"},
# balões -> balón
{"ões",3,"ón"},
# capitães -> capitão
{"ães",1,"ão",{"mães","magalhães"}},
# normais -> normal
{"ais",2,"al",{"cais","tais","mais","pais","ademais"}},
{"áis",2,"al",{"cáis","táis", "máis", "páis", "ademáis"}},
# papéis -> papel
{"éis",2,"el"},
# posíbeis -> posíbel
{"eis",2,"el"},
# espanhóis -> espanhol
{"óis",2,"ol",{"escornabóis"}},
# caracois -> caracol
{"ois",2,"ol",{"escornabois"}},
# cadrís -> cadril
{"ís",2,"il",{"país"}},
# cadris -> cadril
{"is",2,"il",{"menfis","pais","kinguis"}},
# males -> mal
{"les",2,"l",{"ingles","marselles","montreales","senegales","manizales","móstoles","nápoles"}},
# mares -> mar
{"res",3,"r",{"petres","henares","cáceres","baleares","linares","londres","mieres","miraflores","mércores","venres", "pires"}},
# luces -> luz
{"ces",2,"z"},
# luzes -> luz
{"zes",2,"z"},
# leises -> lei
{"ises",3,"z"},
# animás -> animal
{"ás",1,"al",{"más"}},
# gases -> gas
{"ses",2,"s"},
# casas -> casa
{"s",2,"",{"barbadés","barcelonés","cantonés","gabonés","llanés","medinés","escocés","escocês","francês","barcelonês","cantonês","macramés","reves","barcelones","cantones","gabones","llanes","magallanes","medines","escoces","frances","xoves","martes","aliás","pires","lápis","cais","mais","mas","menos","férias","pêsames","crúcis","país","cangas","atenas","asturias","canarias","filipinas","honduras","molucas","caldas","mascareñas","micenas","covarrubias","psoas","óculos","nupcias","xoves","martes","llanes"}}};
||||
|
||||
{ "Unification", 0, 0, {}, |
||||
# cansadísimo -> cansadísimo |
||||
{"íssimo",5,"ísimo"}, |
||||
# cansadísima -> cansadísima |
||||
{"íssima",5,"ísima"}, |
||||
# homaço -> homazo |
||||
{"aço",4,"azo"}, |
||||
# mulheraça -> mulheraza |
||||
{"aça",4,"aza"}, |
||||
# xentuça -> xentuza |
||||
{"uça",4,"uza"}, |
||||
# manilhar -> manillar |
||||
{"lhar",2,"llar"}, |
||||
# colher -> coller |
||||
{"lher",2,"ller"}, |
||||
# melhor -> mellor |
||||
{"lhor",2,"llor"}, |
||||
# alho -> allo |
||||
{"lho",1,"llo"}, |
||||
# linhar -> liñar |
||||
{"nhar",2,"ñar"}, |
||||
# penhor -> peñor |
||||
{"nhor",2,"ñor"}, |
||||
# anho -> año |
||||
{"nho",1,"ño"}, |
||||
# cunha -> cuña |
||||
{"nha",1,"ña"}, |
||||
# hospitalário -> hospitalario |
||||
{"ário",3,"ario"}, |
||||
# bibliotecária -> bibliotecaria |
||||
{"ária",3,"aria"}, |
||||
# agradable -> agradábel |
||||
{"able",2,"ábel"}, |
||||
# agradávele -> agradábel |
||||
{"ável",2,"ábel"}, |
||||
# imposible -> imposíbel |
||||
{"ible",2,"íbel"}, |
||||
# imposível -> imposíbel |
||||
{"ível",2,"íbel"}, |
||||
# imposiçom -> imposición |
||||
{"çom",2,"ción"}, |
||||
# garagem -> garaxe |
||||
{"agem",2,"axe"}, |
||||
# garage -> garaxe |
||||
{"age",2,"axe"}, |
||||
# impressão -> impressón |
||||
{"ão",3,"ón"}, |
||||
# irmao -> irmán |
||||
{"ao",1,"án"}, |
||||
# irmau -> irmán |
||||
{"au",1,"án"}, |
||||
# garrafom -> garrafón |
||||
{"om",3,"ón"}, |
||||
# cantem -> canten |
||||
{"m",2,"n"}}; |
||||
|
||||
{ "Adverb", 0, 0, {}, |
||||
# felizmente -> feliz |
||||
{"mente",4,"",{"experimente","vehemente","sedimente"}}}; |
||||
|
||||
{ "Augmentative", 0, 1, {}, |
||||
# cansadísimo -> cansad |
||||
{"dísimo",5}, |
||||
# cansadísima -> cansad |
||||
{"dísima",5}, |
||||
# amabilísimo -> ama |
||||
{"bilísimo",3}, |
||||
# amabilísima -> ama |
||||
{"bilísima",3}, |
||||
# fortísimo -> fort |
||||
{"ísimo",3}, |
||||
# fortísima -> fort |
||||
{"ísima",3}, |
||||
# centésimo -> cent |
||||
{"ésimo",3}, |
||||
# centésima -> cent |
||||
{"ésima",3}, |
||||
# paupérrimo -> paup |
||||
{"érrimo",4}, |
||||
# paupérrima -> paup |
||||
{"érrima",4}, |
||||
# charlatana -> charlat |
||||
{"ana",2,"",{"argana","banana","choupana","espadana","faciana","iguana","lantana","macana","membrana","mesana","nirvana","obsidiana","palangana","pavana","persiana","pestana","porcelana","pseudomembrana","roldana","sábana","salangana","saragana","ventana"}}, |
||||
# charlatán -> charlat |
||||
{"án",3,"",{"ademán","bardán","barregán","corricán","curricán","faisán","furacán","fustán","gabán","gabián","galán","gañán","lavacán","mazán","mourán","rabadán","serán","serrán","tabán","titán","tobogán","verán","volcán","volován"}}, |
||||
# homazo -> hom |
||||
{"azo",4,"",{"abrazo","espazo","andazo","bagazo","balazo","bandazo","cachazo","carazo","denazo","engazo","famazo","lampreazo","pantocazo","pedazo","preñazo","regazo","ribazo","sobrazo","terrazo","trompazo"}}, |
||||
# mulleraza -> muller |
||||
{"aza",3,"",{"alcarraza","ameaza","baraza","broucaza","burgaza","cabaza","cachaza","calaza","carpaza","carraza","coiraza","colmaza","fogaza","famaza","labaza","liñaza","melaza","mordaza","paraza","pinaza","rabaza","rapaza","trancaza"}}, |
||||
# cascallo -> casc |
||||
{"allo",4,"",{"traballo"}}, |
||||
# xentalla -> xent |
||||
{"alla",4}, |
||||
# bocarra -> boc |
||||
{"arra",3,"",{"cigarra","cinzarra"}}, |
||||
# medicastro -> medic |
||||
{"astro",3,"",{"balastro","bimbastro","canastro","retropilastro"}}, |
||||
# poetastra -> poet |
||||
{"astra",3,"",{"banastra","canastra","contrapilastra","piastra","pilastra"}}, |
||||
# corpázio -> corp |
||||
{"ázio",3,"",{"topázio"}}, |
||||
# soutelo -> sout |
||||
{"elo",4,"",{"bacelo","barrelo","bicarelo","biquelo","boquelo","botelo","bouquelo","cacarelo","cachelo","cadrelo","campelo","candelo","cantelo","carabelo","carambelo","caramelo","cercelo","cerebelo","chocarelo","coitelo","conchelo","corbelo","cotobelo","couselo","destelo","desvelo","esfácelo","fandelo","fardelo","farelo","farnelo","flabelo","ganchelo","garfelo","involucelo","mantelo","montelo","outerelo","padicelo","pesadelo","pinguelo","piquelo","rampelo","rastrelo","restelo","tornecelo","trabelo","restrelo","portelo","ourelo","zarapelo"}}, |
||||
# avioneta -> avion |
||||
{"eta",3,"",{"arqueta","atleta","avoceta","baioneta","baldeta","banqueta","barraganeta","barreta","borleta","buceta","caceta","calceta","caldeta","cambeta","canaleta","caneta","carreta","cerceta","chaparreta","chapeta","chareta","chincheta","colcheta","cometa","corbeta","corveta","cuneta","desteta","espeta","espoleta","estafeta","esteta","faceta","falanxeta","frasqueta","gaceta","gabeta","galleta","garabeta","gaveta","glorieta","lagareta","lambeta","lanceta","libreta","maceta","macheta","maleta","malleta","mareta","marreta","meseta","mofeta","muleta","peseta","planeta","raqueta","regreta","saqueta","veleta","vendeta","viñeta"}}, |
||||
# guapete -> guap |
||||
{"ete",3,"",{"alfinete","ariete","bacinete","banquete","barallete","barrete","billete","binguelete","birrete","bonete","bosquete","bufete","burlete","cabalete","cacahuete","cavinete","capacete","carrete","casarete","casete","chupete","clarinete","colchete","colete","capete","curupete","disquete","estilete","falsete","ferrete","filete","gallardete","gobelete","inglete","machete","miquelete","molete","mosquete","piquete","ribete","rodete","rolete","roquete","sorvete","vedete","vendete"}}, |
||||
# práctica -> práct |
||||
{"ica",3,"",{"andarica","botánica","botica","dialéctica","dinámica","física","formica","gráfica","marica","túnica"}}, |
||||
# práctico -> práct |
||||
{"ico",3,"",{"conico","acetifico","acidifico"}}, |
||||
# trapexo -> trap |
||||
{"exo",3,"",{"arpexo","arquexo","asexo","axexo","azulexo","badexo","bafexo","bocexo","bosquexo","boubexo","cacarexo","carrexo","cascarexo","castrexo","convexo","cotexo","desexo","despexo","forcexo","gabexo","gargarexo","gorgolexo","inconexo","manexo","merexo","narnexo","padexo","patexo","sopexo","varexo"}}, |
||||
{"exa",3,"",{"airexa","bandexa","carrexa","envexa","igrexa","larexa","patexa","presexa","sobexa"}}, |
||||
# multidão -> mult |
||||
{"idão",3}, |
||||
# pequeniño -> pequeno |
||||
{"iño",3,"o",{"camiño","cariño","comiño","golfiño","padriño","sobriño","viciño","veciño"}}, |
||||
# pequeniña -> pequena |
||||
{"iña",3,"a",{"camariña","campiña","entreliña","espiña","fariña","moriña","valiña"}}, |
||||
# grandito -> grand |
||||
{"ito",3,""}, |
||||
# grandita -> grand |
||||
{"ita",3,""}, |
||||
# anomaloide -> animal |
||||
{"oide",3,"",{"anaroide","aneroide","asteroide","axoide","cardioide","celuloide","coronoide","discoide","espermatozoide","espiroide","esquizoide","esteroide","glenoide","linfoide","hemorroide","melaloide","sacaroide","tetraploide","varioloide"}}, |
||||
# cazola -> caz |
||||
{"ola",3,"",{"aixola","ampola","argola","arola","arteríola","bandola","bítola","bractéola","cachola","carambola","carapola","carola","carrandiola","catrapola","cebola","centola","champola","chatola","cirola","cítola","consola","corola","empola","escarola","esmola","estola","fitola","florícola","garañola","gárgola","garxola","glicocola","góndola","mariola","marola","michola","pirola","rebola","rupícola","saxícola","sémola","tachola","tómbola"}}, |
||||
# pedrolo -> pedr |
||||
{"olo",3,"",{"arrolo","babiolo","cacharolo","caixarolo","carolo","carramolo","cascarolo","cirolo","codrolo","correolo","cotrolo","desconsolo","rebolo","repolo","subsolo","tixolo","tómbolo","torolo","trémolo","vacúolo","xermolo","zócolo"}}, |
||||
# vellote -> vell |
||||
{"ote",3,"",{"aigote","alcaiote","barbarote","balote","billote","cachote","camarote","capote","cebote","chichote","citote","cocorote","escote","gañote","garrote","gavote","lamote","lapote","larapote","lingote","lítote","magote","marrote","matalote","pandote","paparote","rebote","tagarote","zarrote"}}, |
||||
# mozota -> moz |
||||
{"ota",3,"",{"asíntota","caiota","cambota","chacota","compota","creosota","curota","derrota","díspota","gamota","maniota","pelota","picota","pillota","pixota","queirota","remota"}}, |
||||
# gordocho -> gord |
||||
{"cho",3,"",{"abrocho","arrocho","carocho","falucho","bombacho","borracho","mostacho"}}, |
||||
# gordecha -> gord |
||||
{"cha",3,"",{"borracha","carracha","estacha","garnacha","limacha","remolacha","abrocha"}}, |
||||
# baratuco -> barat |
||||
{"uco",4,"",{"caduco","estuco","fachuco","malluco","saluco","trabuco"}}, |
||||
# borrachuzo -> borrach |
||||
{"uzo",3,"",{"carriñouzo","fachuzo","mañuzo","mestruzo","tapuzo"}}, |
||||
# xentuza -> xent |
||||
{"uza",3,"",{"barruza","chamuza","chapuza","charamuza","conduza","deduza","desluza","entreluza","induza","reluza","seduza","traduza","trasluza"}}, |
||||
# babuxa -> bab |
||||
{"uxa",3,"",{"caramuxa","carrabouxa","cartuxa","coruxa","curuxa","gaturuxa","maruxa","meruxa","miruxa","moruxa","muruxa","papuxa","rabuxa","trouxa"}}, |
||||
{"uxo",3,"",{"caramuxo","carouxo","carrabouxo","curuxo","debuxo","ganduxo","influxo","negouxo","pertuxo","refluxo"}}, |
||||
# grupello -> grup |
||||
{"ello",3,"",{"alborello","artello","botello","cachafello","calello","casarello","cazabello","cercello","cocerello","concello","consello","desparello","escaravello","espello","fedello","fervello","gagafello","gorrobello","nortello","pendello","troupello","trebello"}}, |
||||
# pontella -> pont |
||||
{"ella",3,"",{"alborella","bertorella","bocatella","botella","calella","cercella","gadella","grosella","lentella","movella","nocella","noitevella","parella","pelella","percebella","segorella","sabella"}}}; |
||||
|
||||
{ "Noun", 0, 0, {}, |
||||
# lealdade -> leal |
||||
{"dade",3,"",{"acridade","calidade"}}, |
||||
# clarificar -> clar |
||||
{"ificar",2}, |
||||
# brasileiro->brasil |
||||
{"eiro",3,"",{"agoireiro","bardalleiro","braseiro","barreiro","canteiro","capoeiro","carneiro","carteiro","cinceiro","faroleiro","mareiro","preguiceiro","quinteiro","raposeiro","retranqueiro","regueiro","sineiro","troleiro","ventureiro"}}, |
||||
# marisqueira -> marisqu |
||||
{"eira",3,"",{"cabeleira","canteira","cocheira","folleira","milleira"}}, |
||||
# hospitalario -> hospital |
||||
{"ario",3,"",{"armario","calcario","lionario","salario"}}, |
||||
# bibliotecaria -> bibliotec |
||||
{"aria",3,"",{"cetaria","coronaria","fumaria","linaria","lunaria","parietaria","saponaria","serpentaria"}}, |
||||
# humorístico -> humor |
||||
{"ístico",3,"",{"balístico", "ensaístico"}}, |
||||
# castrista -> castr |
||||
{"ista",3,"",{"batista","ciclista","fadista","operista","tenista","verista"}}, |
||||
# lavado -> lav |
||||
{"ado",2,"",{"grado","agrado"}}, |
||||
# decanato -> decan |
||||
{"ato",2,"",{"agnato"}}, |
||||
# xemido -> xem |
||||
{"ido",3,"",{"cándido","cândido","consolido","decidido","duvido","marido","rápido"}}, |
||||
# mantida -> mant |
||||
{"ida",3,"",{"bastida","dúbida","dubida","duvida","ermida","éxida","guarida","lapicida","medida","morida"}}, |
||||
{"ída",3}, |
||||
# mantído -> mant |
||||
{"ido",3}, |
||||
# orelludo -> orell |
||||
{"udo",3,"",{"estudo","escudo"}}, |
||||
# orelluda -> orell |
||||
{"uda",3}, |
||||
{"ada",3,"",{"abada","alhada","allada","pitada"}}, |
||||
# comedela -> come |
||||
{"dela",3,"",{"cambadela","cavadela","forcadela","erisipidela","mortadela","espadela","fondedela","picadela","arandela","candela","cordela","escudela","pardela"}}, |
||||
# fontela -> font |
||||
{"ela",3,"",{"canela","capela","cotela","cubela","curupela","escarapela","esparrela","estela","fardela","flanela","fornela","franela","gabela","gamela","gavela","glumela","granicela","lamela","lapela","malvela","manela","manganela","mexarela","micela","mistela","novela","ourela","panela","parcela","pasarela","patamela","patela","paxarela","pipela","pitela","postela","pubela","restela","sabela","salmonela","secuela","sentinela","soldanela","subela","temoncela","tesela","tixela","tramela","trapela","varela","vitela","xanela","xestela"}}, |
||||
# agradábel -> agrad |
||||
{"ábel",2,"",{"afábel","fiábel"}}, |
||||
# combustíbel -> combust |
||||
{"íbel",2,"",{"críbel","imposíbel","posíbel","fisíbel","falíbel"}}, |
||||
# fabricante -> frabrica |
||||
{"nte",3,"",{"alimente","adiante","acrescente","elefante","frequente","freqüente","gigante","instante","oriente","permanente","posante","possante","restaurante"}}, |
||||
# ignorancia -> ignora |
||||
{"ncia",3}, |
||||
# temperanza -> tempera |
||||
{"nza",3}, |
||||
{"acia",3,"",{"acracia","audacia","falacia","farmacia"}}, |
||||
# inmundicia -> inmund |
||||
{"icia",3,"",{"caricia","delicia","ledicia","malicia","milicia","noticia","pericia","presbicia","primicia","regalicia","sevicia","tiricia"}}, |
||||
# xustiza -> xust |
||||
{"iza",3,"",{"alvariza","baliza","cachiza","caniza","cañiza","carbaliza","carriza","chamariza","chapiza","fraguiza","latiza","longaniza","mañiza","nabiza","peliza","preguiza","rabiza"}}, |
||||
# clarexar -> clar |
||||
{"exar",3,"",{"palmexar"}}, |
||||
# administración -> administr |
||||
{"ación",2,"",{"aeración"}}, |
||||
# expedición -> exped |
||||
{"ición",3,"",{"condición","gornición","monición","nutrición","petición","posición","sedición","volición"}}, |
||||
# excepción -> except |
||||
{"ción",3,"t"}, |
||||
# comprensión -> comprens |
||||
{"sión",3,"s",{"abrasión", "alusión"}}, |
||||
# doazón -> do |
||||
{"azón",2,"",{"armazón"}}, |
||||
# garrafón -> garraf |
||||
{"ón",3,"",{"abalón","acordeón","alción","aldrabón","alerón","aliñón","ambón","bombón","calzón","campón","canalón","cantón","capitón","cañón","centón","ciclón","collón","colofón","copón","cotón","cupón","petón","tirón","tourón","turón","unción","versión","zubón","zurrón"}}, |
||||
# lambona -> lamb |
||||
{"ona",3,"",{"abandona","acetona","aleurona","amazona","anémona","bombona","cambona","carona","chacona","charamona","cincona","condona","cortisona","cretona","cretona","detona","estona","fitohormona","fregona","gerona","hidroquinona","hormona","lesiona","madona","maratona","matrona","metadona","monótona","neurona","pamplona","peptona","poltrona","proxesterona","quinona","quinona","silicona","sulfona"}}, |
||||
# bretoa -> bretón |
||||
{"oa",3,"",{"abandoa","madroa","barbacoa","estoa","airoa","eiroa","amalloa","ámboa","améndoa","anchoa","antinéboa","avéntoa","avoa","bágoa","balboa","bisavoa","boroa","canoa","caroa","comadroa","coroa","éngoa","espácoa","filloa","fírgoa","grañoa","lagoa","lanzoa","magoa","mámoa","morzoa","noiteboa","noraboa","parañoa","persoa","queiroa","rañoa","táboa","tataravoa","teiroa"}}, |
||||
# demoníaco -> demoní |
||||
{"aco",3}, |
||||
# demoníaca -> demoní |
||||
{"aca",3,"",{"alpaca","barraca","bullaca","buraca","carraca","casaca","cavaca","cloaca","entresaca","ervellaca","espinaca","estaca","farraca","millaca","pastinaca","pataca","resaca","urraca","purraca"}}, |
||||
# carballal -> carball |
||||
{"al",4,"",{"afinal","animal","estatal","bisexual","bissexual","desleal","fiscal","formal","pessoal","persoal","liberal","postal","virtual","visual","pontual","puntual","homosexual","heterosexual"}}, |
||||
# nadador -> nada |
||||
{"dor",2,"",{"abaixador"}}, |
||||
# benfeitor -> benfei |
||||
{"tor",3,"",{"autor","motor","pastor","pintor"}}, |
||||
# produtor -> produt |
||||
{"or",2,"",{"asesor","assessor","favor","mellor","melhor","redor","rigor","sensor","tambor","tumor"}}, |
||||
# profesora -> profes |
||||
{"ora",3,"",{"albacora","anáfora","áncora","apisoadora","ardora","ascospora","aurora","avéspora","bitácora","canéfora","cantimplora","catáfora","cepilladora","demora","descalcificadora","diáspora","empacadora","epífora","ecavadora","escora","eslora","espora","fotocompoñedora","fotocopiadora","grampadora","isícora","lavadora","lixadora","macrospora","madrépora","madrágora","masora","mellora","metáfora","microspora","milépora","milpéndora","nécora","oospora","padeadora","pasiflora","pécora","píldora","pólvora","ratinadora","rémora","retroescavadora","sófora","torradora","trémbora","uredospora","víbora","víncora","zoospora"}}, |
||||
# zapataría -> zapat |
||||
{"aría",3,"",{"libraría"}}, |
||||
# etiquetaxe -> etiquet |
||||
{"axe",3,"",{"aluaxe","amaraxe","amperaxe","bagaxe","balaxe","barcaxe","borraxe","bescaxe","cabotaxe","carraxe","cartilaxe","chantaxe","colaxe","coraxe","carruaxe","dragaxe","embalaxe","ensilaxe","epistaxe","fagundaxe","fichaxe","fogaxe","forraxe","fretaxe","friaxe","garaxe","homenaxe","leitaxe","liñaxe","listaxe","maraxe","marcaxe","maridaxe","masaxe","miraxe","montaxe","pasaxe","peaxe","portaxe","ramaxe","rebelaxe","rodaxe","romaxe","sintaxe","sondaxe","tiraxe","vantaxe","vendaxe","viraxe"}}, |
||||
# movedizo -> move |
||||
{"dizo",3}, |
||||
# limpeza -> limp |
||||
{"eza",3,"",{"alteza","beleza","fereza","fineza","vasteza","vileza"}}, |
||||
# rixidez -> rixid |
||||
{"ez",3,"",{"acidez","adultez","adustez","avidez","candidez","mudez","nenez","nudez","pomez"}}, |
||||
# mullerengo -> muller |
||||
{"engo",3}, |
||||
# chairego -> chair |
||||
{"ego",3,"",{"corego","derrego","entrego","lamego","sarego","sartego"}}, |
||||
# cariñoso -> cariñ |
||||
{"oso",3,"",{"afanoso","algoso","caldoso","caloso","cocoso","ditoso","favoso","fogoso","lamoso","mecoso","mocoso","precioso","rixoso","venoso","viroso","xesoso"}}, |
||||
# cariñosa -> cariñ |
||||
{"osa",3,"",{"mucosa","glicosa","baldosa","celulosa","isoglosa","nitrocelulosa","levulosa","ortosa","pectosa","preciosa","sacarosa","serosa","ventosa"}}, |
||||
# negrume -> negr |
||||
{"ume",3,"",{"agrume","albume","alcume","batume","cacume","cerrume","chorume","churume","costume","curtume","estrume","gafume","legume","perfume","queixume","zarrume"}}, |
||||
# altura -> alt |
||||
{"ura",3,"",{"albura","armadura","imatura","costura"}}, |
||||
# cuspiñar -> cusp |
||||
{"iñar",3}, |
||||
# febril -> febr |
||||
{"il",3,"",{"abril","alfil","anil","atril","badil","baril","barril","brasil","cadril","candil","cantil","carril","chamil","chancil","civil","cubil","dátil","difícil","dócil","edil","estéril","fácil","fráxil","funil","fusil","grácil","gradil","hábil","hostil","marfil"}}, |
||||
# principesco -> princip |
||||
{"esco",4}, |
||||
# mourisco -> mour |
||||
{"isco",4}, |
||||
# esportivo -> esport |
||||
{"ivo",3,"",{"pasivo","positivo","passivo","possessivo","posesivo","pexotarivo","relativo"}}}; |
||||
|
||||
{ "Verb", 0, 0, {}, |
||||
# amaba -> am |
||||
{"aba",2}, |
||||
# andabade -> and |
||||
{"abade",2}, |
||||
# andábade -> and |
||||
{"ábade",2}, |
||||
# chorabamo -> chor |
||||
{"abamo",2}, |
||||
# chorábamo -> chor |
||||
{"ábamo",2}, |
||||
# moraban -> morab |
||||
{"aban",2}, |
||||
# andache -> and |
||||
{"ache",2}, |
||||
# andade -> and |
||||
{"ade",2}, |
||||
{"an",2}, |
||||
# cantando -> cant |
||||
{"ando",2}, |
||||
# cantar -> cant |
||||
{"ar",2,"",{"azar","bazar","patamar"}}, |
||||
# lembrarade -> lembra |
||||
{"arade",2}, |
||||
{"aramo",2}, |
||||
{"arán",2}, |
||||
# cantaran -> cant |
||||
{"aran",2}, |
||||
# convidárade -> convid |
||||
{"árade",2}, |
||||
# convidaría -> convid |
||||
{"aría",2}, |
||||
# cantariade -> cant |
||||
{"ariade",2}, |
||||
# cantaríade -> cant |
||||
{"aríade",2}, |
||||
# cantarian -> cant |
||||
{"arian",2}, |
||||
# cantariamo -> cant |
||||
{"ariamo",2}, |
||||
# pescaron -> pesc |
||||
{"aron",2}, |
||||
# cantase -> cant |
||||
{"ase",2}, |
||||
# cantasede -> cant |
||||
{"asede",2}, |
||||
# cantásede -> cant |
||||
{"ásede",2}, |
||||
# cantasemo -> cant |
||||
{"asemo",2}, |
||||
# cantásemo -> cant |
||||
{"ásemo",2}, |
||||
# cantasen -> cant |
||||
{"asen",2}, |
||||
# loitavan -> loitav |
||||
{"avan",2}, |
||||
# cantaríamo -> cant |
||||
{"aríamo",2}, |
||||
# cantassen -> cant |
||||
{"assen",2}, |
||||
# cantássemo -> cant |
||||
{"ássemo",2}, |
||||
# beberíamo -> beb |
||||
{"eríamo",2}, |
||||
# bebêssemo -> beb |
||||
{"êssemo",2}, |
||||
# partiríamo -> part |
||||
{"iríamo",3}, |
||||
# partíssemo -> part |
||||
{"íssemo",3}, |
||||
# cantáramo -> cant |
||||
{"áramo",2}, |
||||
# cantárei -> cant |
||||
{"árei",2}, |
||||
# cantaren -> cant |
||||
{"aren",2}, |
||||
# cantaremo -> cant |
||||
{"aremo",2}, |
||||
# cantaríei -> cant |
||||
{"aríei",2}, |
||||
{"ássei",2}, |
||||
# cantávamo-> cant |
||||
{"ávamo",2}, |
||||
# bebêramo -> beb |
||||
{"êramo",1}, |
||||
# beberemo -> beb |
||||
{"eremo",1}, |
||||
# beberíei -> beb |
||||
{"eríei",1}, |
||||
# bebêssei -> beb |
||||
{"êssei",1}, |
||||
# partíramo -> part |
||||
{"íramo",3}, |
||||
# partiremo -> part |
||||
{"iremo",3}, |
||||
# partiríei -> part |
||||
{"iríei",3}, |
||||
# partíssei -> part |
||||
{"íssei",3}, |
||||
# partissen -> part |
||||
{"issen",3}, |
||||
# bebendo -> beb |
||||
{"endo",1}, |
||||
# partindo -> part |
||||
{"indo",3}, |
||||
# propondo -> prop |
||||
{"ondo",3}, |
||||
# cantarde -> cant |
||||
{"arde",2}, |
||||
# cantarei -> cant |
||||
{"arei",2}, |
||||
# cantaria -> cant |
||||
{"aria",2}, |
||||
# cantarmo -> cant |
||||
{"armo",2}, |
||||
# cantasse -> cant |
||||
{"asse",2}, |
||||
{"aste",2}, |
||||
# cantávei -> cant |
||||
{"ávei",2}, |
||||
# perderão -> perd |
||||
{"erão",1}, |
||||
# beberde -> beb |
||||
{"erde",1}, |
||||
# beberei -> beb |
||||
{"erei",1}, |
||||
# bebêrei -> beb |
||||
{"êrei",1}, |
||||
# beberen -> beb |
||||
{"eren",2}, |
||||
# beberia -> beb |
||||
{"eria",1}, |
||||
# bebermo -> beb |
||||
{"ermo",1}, |
||||
# bebeste -> beb |
||||
{"este",1,"",{"faroeste","agreste"}}, |
||||
# bebíamo -> beb |
||||
{"íamo",1}, |
||||
# fuxian -> fux |
||||
{"ian",2,"",{"enfian","eloxian","ensaian"}}, |
||||
# partirde -> part |
||||
{"irde",2}, |
||||
# partirei -> part |
||||
{"irei",3,"",{"admirei"}}, |
||||
# partiren -> part |
||||
{"iren",3}, |
||||
# partiria -> part |
||||
{"iria",3}, |
||||
# partirmo -> part |
||||
{"irmo",3}, |
||||
# partisse -> part |
||||
{"isse",3}, |
||||
# partiste -> part |
||||
{"iste",4}, |
||||
{"iava",1,"",{"ampliava"}}, |
||||
# cantamo -> cant |
||||
{"amo",2}, |
||||
# funciona -> func |
||||
{"iona",3}, |
||||
# cantara -> cant |
||||
{"ara",2,"",{"arara","prepara"}}, |
||||
# enviará -> envi |
||||
{"ará",2,"",{"alvará","bacará"}}, |
||||
# cantare -> cant |
||||
{"are",2,"",{"prepare"}}, |
||||
# cantava -> cant |
||||
{"ava",2,"",{"agrava"}}, |
||||
# cantemo -> cant |
||||
{"emo",2}, |
||||
# bebera -> beb |
||||
{"era",1,"",{"acelera","espera"}}, |
||||
# beberá -> beb |
||||
{"erá",1}, |
||||
# bebere -> beb |
||||
{"ere",1,"",{"espere"}}, |
||||
# bebíei -> beb |
||||
{"íei",1}, |
||||
# metin -> met |
||||
{"in",3}, |
||||
# partimo -> part |
||||
{"imo",3,"",{"reprimo","intimo","íntimo","nimo","queimo","ximo"}}, |
||||
# partira -> part |
||||
{"ira",3,"",{"fronteira","sátira"}}, |
||||
{"ído",3}, |
||||
# partirá -> part |
||||
{"irá",3}, |
||||
# concretizar -> concre |
||||
{"tizar",4,"",{"alfabetizar"}}, |
||||
{"izar",3,"",{"organizar"}}, |
||||
# saltitar -> salt |
||||
{"itar",5,"",{"acreditar","explicitar","estreitar"}}, |
||||
# partire -> part |
||||
{"ire",3,"",{"adquire"}}, |
||||
# compomo -> comp |
||||
{"omo",3}, |
||||
{"ai",2}, |
||||
# barbear -> barb |
||||
{"ear",4,"",{"alardear","nuclear"}}, |
||||
# cheguei -> cheg |
||||
{"uei",3}, |
||||
{"uía",5,"u"}, |
||||
# cantei -> cant |
||||
{"ei",3}, |
||||
# beber -> beb |
||||
{"er",1,"",{"éter","pier"}}, |
||||
# bebeu -> beb |
||||
{"eu",1,"",{"chapeu"}}, |
||||
# bebia -> beb |
||||
{"ia",1,"",{"estória","fatia","acia","praia","elogia","mania","lábia","aprecia","polícia","arredia","cheia","ásia"}}, |
||||
# partir -> part |
||||
{"ir",3}, |
||||
# partiu -> part |
||||
{"iu",3}, |
||||
# fraqueou -> fraqu |
||||
{"eou",5}, |
||||
# chegou -> cheg |
||||
{"ou",3}, |
||||
# bebi -> beb |
||||
{"i",1}, |
||||
# varrede -> varr |
||||
{"ede",1,"",{"rede","bípede","céspede","parede","palmípede","vostede","hóspede","adrede"}}, |
||||
# cantei -> cant |
||||
{"ei",3}, |
||||
# anden -> and |
||||
{"en",2}, |
||||
# descerade -> desc |
||||
{"erade",1}, |
||||
# vivérade -> viv |
||||
{"érade",1}, |
||||
# beberan -> beb |
||||
{"eran",2}, |
||||
# colleramo -> coll |
||||
{"eramo",1}, |
||||
# bebéramo -> beb |
||||
{"éramo",1}, |
||||
# perderán -> perd |
||||
{"erán",1}, |
||||
# varrería -> varr |
||||
{"ería",1}, |
||||
# beberiade -> beb |
||||
{"eriade",1}, |
||||
# beberíade -> beb |
||||
{"eríade",1}, |
||||
# beberiamo -> beb |
||||
{"eriamo",1}, |
||||
# beberian -> beb |
||||
{"erian",1}, |
||||
# beberían -> beb |
||||
{"erían",1}, |
||||
# perderon -> perd |
||||
{"eron",1}, |
||||
# bebese -> beb |
||||
{"ese",1}, |
||||
# bebesedes -> beb |
||||
{"esedes",1}, |
||||
# bebésedes -> beb |
||||
{"ésedes",1}, |
||||
# bebesemo -> beb |
||||
{"esemo",1}, |
||||
# bebésemo -> beb |
||||
{"ésemo",1}, |
||||
# bebesen -> beb |
||||
{"esen",1}, |
||||
# bebêssede -> beb |
||||
{"êssede",1}, |
||||
# chovía -> chov |
||||
{"ía",1}, |
||||
# faciade -> fac |
||||
{"iade",1}, |
||||
# facíade -> fac |
||||
{"íade",1}, |
||||
# perdiamo -> perd |
||||
{"iamo",1}, |
||||
# fuxían -> fux |
||||
{"ían",1}, |
||||
# corriche -> corr |
||||
{"iche",1}, |
||||
# partide -> part |
||||
{"ide",1}, |
||||
# escribirade -> escrib |
||||
{"irade",3}, |
||||
# parírade -> par |
||||
{"írade",3}, |
||||
# partiramo -> part |
||||
{"iramo",3}, |
||||
# fugirán -> fug |
||||
{"irán",3}, |
||||
# viviría -> viv |
||||
{"iría",3}, |
||||
# partiriade -> part |
||||
{"iriade",3}, |
||||
# partiríade -> part |
||||
{"iríade",3}, |
||||
# partiriamo -> part |
||||
{"iriamo",3}, |
||||
# partirian -> part |
||||
{"irian",3}, |
||||
# partirían -> part |
||||
{"irían",3}, |
||||
# reflectiron -> reflect |
||||
{"iron",3}, |
||||
# partise -> part |
||||
{"ise",3}, |
||||
# partisede -> part |
||||
{"isede",3}, |
||||
# partísede -> part |
||||
{"ísede",3}, |
||||
# partisemo -> part |
||||
{"isemo",3}, |
||||
# partísemo -> part |
||||
{"ísemo",3}, |
||||
# partisen -> part |
||||
{"isen",3}, |
||||
# partíssede -> part |
||||
{"íssede",3}, |
||||
{"tizar",3,"",{"alfabetizar"}}, |
||||
{"ondo",3}}; |
||||
|
||||
{ "Vowel", 0, 0, {}, |
||||
# segue -> seg |
||||
{"gue",2,"g",{"azougue","dengue","merengue","nurague","merengue","rengue"}}, |
||||
{"que",2,"c",{"alambique","albaricoque","abaroque","alcrique","almadraque","almanaque","arenque","arinque","baduloque","ballestrinque","betoque","bivaque","bloque","bodaque","bosque","breque","buque","cacique","cheque","claque","contradique","coque","croque","dique","duque","enroque","espeque","estoque","estoraque","estraloque","estrinque","milicroque","monicreque","orinque","arinque","palenque","parque","penique","picabeque","pique","psique","raque","remolque","xeque","repenique","roque","sotobosque","tabique","tanque","toque","traque","truque","vivaque","xaque"}}, |
||||
{"a",3,"",{"amasadela","cerva"}}, |
||||
{"e",3,"",{"marte"}}, |
||||
{"o",3,"",{"barro","fado","cabo","libro","cervo"}}, |
||||
{"â",3}, |
||||
{"ã",3,"",{"amanhã","arapuã","fã","divã","manhã"}}, |
||||
{"ê",3}, |
||||
{"ô",3}, |
||||
{"á",3}, |
||||
{"é",3}, |
||||
{"ó",3}, |
||||
# munxi -> munx |
||||
{"i",3}}; |
@ -0,0 +1,161 @@
|
||||
# Galician stopwords |
||||
a |
||||
aínda |
||||
alí |
||||
aquel |
||||
aquela |
||||
aquelas |
||||
aqueles |
||||
aquilo |
||||
aquí |
||||
ao |
||||
aos |
||||
as |
||||
así |
||||
á |
||||
ben |
||||
cando |
||||
che |
||||
co |
||||
coa |
||||
comigo |
||||
con |
||||
connosco |
||||
contigo |
||||
convosco |
||||
coas |
||||
cos |
||||
cun |
||||
cuns |
||||
cunha |
||||
cunhas |
||||
da |
||||
dalgunha |
||||
dalgunhas |
||||
dalgún |
||||
dalgúns |
||||
das |
||||
de |
||||
del |
||||
dela |
||||
delas |
||||
deles |
||||
desde |
||||
deste |
||||
do |
||||
dos |
||||
dun |
||||
duns |
||||
dunha |
||||
dunhas |
||||
e |
||||
el |
||||
ela |
||||
elas |
||||
eles |
||||
en |
||||
era |
||||
eran |
||||
esa |
||||
esas |
||||
ese |
||||
eses |
||||
esta |
||||
estar |
||||
estaba |
||||
está |
||||
están |
||||
este |
||||
estes |
||||
estiven |
||||
estou |
||||
eu |
||||
é |
||||
facer |
||||
foi |
||||
foron |
||||
fun |
||||
había |
||||
hai |
||||
iso |
||||
isto |
||||
la |
||||
las |
||||
lle |
||||
lles |
||||
lo |
||||
los |
||||
mais |
||||
me |
||||
meu |
||||
meus |
||||
min |
||||
miña |
||||
miñas |
||||
moi |
||||
na |
||||
nas |
||||
neste |
||||
nin |
||||
no |
||||
non |
||||
nos |
||||
nosa |
||||
nosas |
||||
noso |
||||
nosos |
||||
nós |
||||
nun |
||||
nunha |
||||
nuns |
||||
nunhas |
||||
o |
||||
os |
||||
ou |
||||
ó |
||||
ós |
||||
para |
||||
pero |
||||
pode |
||||
pois |
||||
pola |
||||
polas |
||||
polo |
||||
polos |
||||
por |
||||
que |
||||
se |
||||
senón |
||||
ser |
||||
seu |
||||
seus |
||||
sexa |
||||
sido |
||||
sobre |
||||
súa |
||||
súas |
||||
tamén |
||||
tan |
||||
te |
||||
ten |
||||
teñen |
||||
teño |
||||
ter |
||||
teu |
||||
teus |
||||
ti |
||||
tido |
||||
tiña |
||||
tiven |
||||
túa |
||||
túas |
||||
un |
||||
unha |
||||
unhas |
||||
uns |
||||
vos |
||||
vosa |
||||
vosas |
||||
voso |
||||
vosos |
||||
vós |
@ -0,0 +1,46 @@
|
||||
# example set of Armenian stopwords. |
||||
այդ |
||||
այլ |
||||
այն |
||||
այս |
||||
դու |
||||
դուք |
||||
եմ |
||||
են |
||||
ենք |
||||
ես |
||||
եք |
||||
է |
||||
էի |
||||
էին |
||||
էինք |
||||
էիր |
||||
էիք |
||||
էր |
||||
ըստ |
||||
թ |
||||
ի |
||||
ին |
||||
իսկ |
||||
իր |
||||
կամ |
||||
համար |
||||
հետ |
||||
հետո |
||||
մենք |
||||
մեջ |
||||
մի |
||||
ն |
||||
նա |
||||
նաև |
||||
նրա |
||||
նրանք |
||||
որ |
||||
որը |
||||
որոնք |
||||
որպես |
||||
ու |
||||
ում |
||||
պիտի |
||||
վրա |
||||
և |
@ -0,0 +1,359 @@
|
||||
# from appendix D of: A Study of Stemming Effects on Information |
||||
# Retrieval in Bahasa Indonesia |
||||
ada |
||||
adanya |
||||
adalah |
||||
adapun |
||||
agak |
||||
agaknya |
||||
agar |
||||
akan |
||||
akankah |
||||
akhirnya |
||||
aku |
||||
akulah |
||||
amat |
||||
amatlah |
||||
anda |
||||
andalah |
||||
antar |
||||
diantaranya |
||||
antara |
||||
antaranya |
||||
diantara |
||||
apa |
||||
apaan |
||||
mengapa |
||||
apabila |
||||
apakah |
||||
apalagi |
||||
apatah |
||||
atau |
||||
ataukah |
||||
ataupun |
||||
bagai |
||||
bagaikan |
||||
sebagai |
||||
sebagainya |
||||
bagaimana |
||||
bagaimanapun |
||||
sebagaimana |
||||
bagaimanakah |
||||
bagi |
||||
bahkan |
||||
bahwa |
||||
bahwasanya |
||||
sebaliknya |
||||
banyak |
||||
sebanyak |
||||
beberapa |
||||
seberapa |
||||
begini |
||||
beginian |
||||
beginikah |
||||
beginilah |
||||
sebegini |
||||
begitu |
||||
begitukah |
||||
begitulah |
||||
begitupun |
||||
sebegitu |
||||
belum |
||||
belumlah |
||||
sebelum |
||||
sebelumnya |
||||
sebenarnya |
||||
berapa |
||||
berapakah |
||||
berapalah |
||||
berapapun |
||||
betulkah |
||||
sebetulnya |
||||
biasa |
||||
biasanya |
||||
bila |
||||
bilakah |
||||
bisa |
||||
bisakah |
||||
sebisanya |
||||
boleh |
||||
bolehkah |
||||
bolehlah |
||||
buat |
||||
bukan |
||||
bukankah |
||||
bukanlah |
||||
bukannya |
||||
cuma |
||||
percuma |
||||
dahulu |
||||
dalam |
||||
dan |
||||
dapat |
||||
dari |
||||
daripada |
||||
dekat |
||||
demi |
||||
demikian |
||||
demikianlah |
||||
sedemikian |
||||
dengan |
||||
depan |
||||
di |
||||
dia |
||||
dialah |
||||
dini |
||||
diri |
||||
dirinya |
||||
terdiri |
||||
dong |
||||
dulu |
||||
enggak |
||||
enggaknya |
||||
entah |
||||
entahlah |
||||
terhadap |
||||
terhadapnya |
||||
hal |
||||
hampir |
||||
hanya |
||||
hanyalah |
||||
harus |
||||
haruslah |
||||
harusnya |
||||
seharusnya |
||||
hendak |
||||
hendaklah |
||||
hendaknya |
||||
hingga |
||||
sehingga |
||||
ia |
||||
ialah |
||||
ibarat |
||||
ingin |
||||
inginkah |
||||
inginkan |
||||
ini |
||||
inikah |
||||
inilah |
||||
itu |
||||
itukah |
||||
itulah |
||||
jangan |
||||
jangankan |
||||
janganlah |
||||
jika |
||||
jikalau |
||||
juga |
||||
justru |
||||
kala |
||||
kalau |
||||
kalaulah |
||||
kalaupun |
||||
kalian |
||||
kami |
||||
kamilah |
||||
kamu |
||||
kamulah |
||||
kan |
||||
kapan |
||||
kapankah |
||||
kapanpun |
||||
dikarenakan |
||||
karena |
||||
karenanya |
||||
ke |
||||
kecil |
||||
kemudian |
||||
kenapa |
||||
kepada |
||||
kepadanya |
||||
ketika |
||||
seketika |
||||
khususnya |
||||
kini |
||||
kinilah |
||||
kiranya |
||||
sekiranya |
||||
kita |
||||
kitalah |
||||
kok |
||||
lagi |
||||
lagian |
||||
selagi |
||||
lah |
||||
lain |
||||
lainnya |
||||
melainkan |
||||
selaku |
||||
lalu |
||||
melalui |
||||
terlalu |
||||
lama |
||||
lamanya |
||||
selama |
||||
selama |
||||
selamanya |
||||
lebih |
||||
terlebih |
||||
bermacam |
||||
macam |
||||
semacam |
||||
maka |
||||
makanya |
||||
makin |
||||
malah |
||||
malahan |
||||
mampu |
||||
mampukah |
||||
mana |
||||
manakala |
||||
manalagi |
||||
masih |
||||
masihkah |
||||
semasih |
||||
masing |
||||
mau |
||||
maupun |
||||
semaunya |
||||
memang |
||||
mereka |
||||
merekalah |
||||
meski |
||||
meskipun |
||||
semula |
||||
mungkin |
||||
mungkinkah |
||||
nah |
||||
namun |
||||
nanti |
||||
nantinya |
||||
nyaris |
||||
oleh |
||||
olehnya |
||||
seorang |
||||
seseorang |
||||
pada |
||||
padanya |
||||
padahal |
||||
paling |
||||
sepanjang |
||||
pantas |
||||
sepantasnya |
||||
sepantasnyalah |
||||
para |
||||
pasti |
||||
pastilah |
||||
per |
||||
pernah |
||||
pula |
||||
pun |
||||
merupakan |
||||
rupanya |
||||
serupa |
||||
saat |
||||
saatnya |
||||
sesaat |
||||
saja |
||||
sajalah |
||||
saling |
||||
bersama |
||||
sama |
||||
sesama |
||||
sambil |
||||
sampai |
||||
sana |
||||
sangat |
||||
sangatlah |
||||
saya |
||||
sayalah |
||||
se |
||||
sebab |
||||
sebabnya |
||||
sebuah |
||||
tersebut |
||||
tersebutlah |
||||
sedang |
||||
sedangkan |
||||
sedikit |
||||
sedikitnya |
||||
segala |
||||
segalanya |
||||
segera |
||||
sesegera |
||||
sejak |
||||
sejenak |
||||
sekali |
||||
sekalian |
||||
sekalipun |
||||
sesekali |
||||
sekaligus |
||||
sekarang |
||||
sekarang |
||||
sekitar |
||||
sekitarnya |
||||
sela |
||||
selain |
||||
selalu |
||||
seluruh |
||||
seluruhnya |
||||
semakin |
||||
sementara |
||||
sempat |
||||
semua |
||||
semuanya |
||||
sendiri |
||||
sendirinya |
||||
seolah |
||||
seperti |
||||
sepertinya |
||||
sering |
||||
seringnya |
||||
serta |
||||
siapa |
||||
siapakah |
||||
siapapun |
||||
disini |
||||
disinilah |
||||
sini |
||||
sinilah |
||||
sesuatu |
||||
sesuatunya |
||||
suatu |
||||
sesudah |
||||
sesudahnya |
||||
sudah |
||||
sudahkah |
||||
sudahlah |
||||
supaya |
||||
tadi |
||||
tadinya |
||||
tak |
||||
tanpa |
||||
setelah |
||||
telah |
||||
tentang |
||||
tentu |
||||
tentulah |
||||
tentunya |
||||
tertentu |
||||
seterusnya |
||||
tapi |
||||
tetapi |
||||
setiap |
||||
tiap |
||||
setidaknya |
||||
tidak |
||||
tidakkah |
||||
tidaklah |
||||
toh |
||||
waduh |
||||
wah |
||||
wahai |
||||
sewaktu |
||||
walau |
||||
walaupun |
||||
wong |
||||
yaitu |
||||
yakni |
||||
yang |
@ -0,0 +1,172 @@
|
||||
# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins |
||||
# the original list of over 800 forms was refined: |
||||
# pronouns, adverbs, interjections were removed |
||||
# |
||||
# prepositions |
||||
aiz |
||||
ap |
||||
ar |
||||
apakš |
||||
ārpus |
||||
augšpus |
||||
bez |
||||
caur |
||||
dēļ |
||||
gar |
||||
iekš |
||||
iz |
||||
kopš |
||||
labad |
||||
lejpus |
||||
līdz |
||||
no |
||||
otrpus |
||||
pa |
||||
par |
||||
pār |
||||
pēc |
||||
pie |
||||
pirms |
||||
pret |
||||
priekš |
||||
starp |
||||
šaipus |
||||
uz |
||||
viņpus |
||||
virs |
||||
virspus |
||||
zem |
||||
apakšpus |
||||
# Conjunctions |
||||
un |
||||
bet |
||||
jo |
||||
ja |
||||
ka |
||||
lai |
||||
tomēr |
||||
tikko |
||||
turpretī |
||||
arī |
||||
kaut |
||||
gan |
||||
tādēļ |
||||
tā |
||||
ne |
||||
tikvien |
||||
vien |
||||
kā |
||||
ir |
||||
te |
||||
vai |
||||
kamēr |
||||
# Particles |
||||
ar |
||||
diezin |
||||
droši |
||||
diemžēl |
||||
nebūt |
||||
ik |
||||
it |
||||
taču |
||||
nu |
||||
pat |
||||
tiklab |
||||
iekšpus |
||||
nedz |
||||
tik |
||||
nevis |
||||
turpretim |
||||
jeb |
||||
iekam |
||||
iekām |
||||
iekāms |
||||
kolīdz |
||||
līdzko |
||||
tiklīdz |
||||
jebšu |
||||
tālab |
||||
tāpēc |
||||
nekā |
||||
itin |
||||
jā |
||||
jau |
||||
jel |
||||
nē |
||||
nezin |
||||
tad |
||||
tikai |
||||
vis |
||||
tak |
||||
iekams |
||||
vien |
||||
# modal verbs |
||||
būt |
||||
biju |
||||
biji |
||||
bija |
||||
bijām |
||||
bijāt |
||||
esmu |
||||
esi |
||||
esam |
||||
esat |
||||
būšu |
||||
būsi |
||||
būs |
||||
būsim |
||||
būsiet |
||||
tikt |
||||
tiku |
||||
tiki |
||||
tika |
||||
tikām |
||||
tikāt |
||||
tieku |
||||
tiec |
||||
tiek |
||||
tiekam |
||||
tiekat |
||||
tikšu |
||||
tiks |
||||
tiksim |
||||
tiksiet |
||||
tapt |
||||
tapi |
||||
tapāt |
||||
topat |
||||
tapšu |
||||
tapsi |
||||
taps |
||||
tapsim |
||||
tapsiet |
||||
kļūt |
||||
kļuvu |
||||
kļuvi |
||||
kļuva |
||||
kļuvām |
||||
kļuvāt |
||||
kļūstu |
||||
kļūsti |
||||
kļūst |
||||
kļūstam |
||||
kļūstat |
||||
kļūšu |
||||
kļūsi |
||||
kļūs |
||||
kļūsim |
||||
kļūsiet |
||||
# verbs |
||||
varēt |
||||
varēju |
||||
varējām |
||||
varēšu |
||||
varēsim |
||||
var |
||||
varēji |
||||
varējāt |
||||
varēsi |
||||
varēsiet |
||||
varat |
||||
varēja |
||||
varēs |
@ -0,0 +1,456 @@
|
||||
# Steps file for the RSLP stemmer. |
||||
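# Format note (an inference from the rules below, matching the step |
||||
# layout documented for Lucene's RSLPStemmerBase, which parses files |
||||
# like this one): a step is |
||||
#   { "Name", min-word-size, end-check, { conditional suffixes }, rules... }; |
||||
# where end-check = 1 means the step only applies to words ending in one |
||||
# of the conditional suffixes (e.g. "Plural" only touches words in -s). |
||||
# Each rule is { "suffix", min-stem-size, "replacement", { exceptions } }: |
||||
# strip the suffix and append the optional replacement, but only if the |
||||
# remaining stem keeps at least min-stem-size characters and the word is |
||||
# not listed as a whole-word exception, e.g. {"ns",1,"m"}: bons -> bom. |
||||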
|
||||
# Step 1: Plural Reduction |
||||
{ "Plural", 3, 1, {"s"}, |
||||
# bons -> bom |
||||
{"ns",1,"m"}, |
||||
# balões -> balão |
||||
{"ões",3,"ão"}, |
||||
# capitães -> capitão |
||||
{"ães",1,"ão",{"mães"}}, |
||||
# normais -> normal |
||||
{"ais",1,"al",{"cais","mais"}}, |
||||
# papéis -> papel |
||||
{"éis",2,"el"}, |
||||
# amáveis -> amável |
||||
{"eis",2,"el"}, |
||||
# lençóis -> lençol |
||||
{"óis",2,"ol"}, |
||||
# barris -> barril |
||||
{"is",2,"il",{"lápis","cais","mais","crúcis","biquínis","pois","depois","dois","leis"}}, |
||||
# males -> mal |
||||
{"les",3,"l"}, |
||||
# mares -> mar |
||||
{"res",3,"r", {"árvores"}}, |
||||
# casas -> casa |
||||
{"s",2,"",{"aliás","pires","lápis","cais","mais","mas","menos","férias","fezes","pêsames","crúcis","gás","atrás","moisés","através","convés","ês","país","após","ambas","ambos","messias", "depois"}}}; |
||||
|
||||
# Step 2: Adverb Reduction |
||||
{ "Adverb", 0, 0, {}, |
||||
# felizmente -> feliz |
||||
{"mente",4,"",{"experimente"}}}; |
||||
|
||||
# Step 3: Feminine Reduction |
||||
{ "Feminine", 3, 1, {"a","ã"}, |
||||
# chefona -> chefão |
||||
{"ona",3,"ão",{"abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","carona"}}, |
||||
# vilã -> vilão |
||||
{"ã",2,"ão",{"amanhã","arapuã","fã","divã"}}, |
||||
# professora -> professor |
||||
{"ora",3,"or"}, |
||||
# americana -> americano |
||||
{"na",4,"no",{"carona","abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","guiana","campana","grana","caravana","banana","paisana"}}, |
||||
# sozinha -> sozinho |
||||
{"inha",3,"inho",{"rainha","linha","minha"}}, |
||||
# inglesa -> inglês |
||||
{"esa",3,"ês",{"mesa","obesa","princesa","turquesa","ilesa","pesa","presa"}}, |
||||
# famosa -> famoso |
||||
{"osa",3,"oso",{"mucosa","prosa"}}, |
||||
# maníaca -> maníaco |
||||
{"íaca",3,"íaco"}, |
||||
# prática -> prático |
||||
{"ica",3,"ico",{"dica"}}, |
||||
# cansada -> cansado |
||||
{"ada",2,"ado",{"pitada"}}, |
||||
# mantida -> mantido |
||||
{"ida",3,"ido",{"vida","dúvida"}}, |
||||
{"ída",3,"ido",{"recaída","saída"}}, |
||||
# prima -> primo |
||||
{"ima",3,"imo",{"vítima"}}, |
||||
# passiva -> passivo |
||||
{"iva",3,"ivo",{"saliva","oliva"}}, |
||||
# primeira -> primeiro |
||||
{"eira",3,"eiro",{"beira","cadeira","frigideira","bandeira","feira","capoeira","barreira","fronteira","besteira","poeira"}}}; |
||||
|
||||
# Step 4: Augmentative/Diminutive Reduction |
||||
{ "Augmentative", 0, 1, {}, |
||||
# cansadíssimo -> cansa |
||||
{"díssimo",5}, |
||||
# amabilíssimo -> am |
||||
{"abilíssimo",5}, |
||||
# fortíssimo -> fort |
||||
{"íssimo",3}, |
||||
{"ésimo",3}, |
||||
# chiquérrimo -> chiqu |
||||
{"érrimo",4}, |
||||
# pezinho -> pe |
||||
{"zinho",2}, |
||||
# maluquinho -> maluc |
||||
{"quinho",4,"c"}, |
||||
# amiguinho -> amig |
||||
{"uinho",4}, |
||||
# cansadinho -> cans |
||||
{"adinho",3}, |
||||
# carrinho -> carr |
||||
{"inho",3,"",{"caminho","cominho"}}, |
||||
# grandalhão -> grand |
||||
{"alhão",4}, |
||||
# dentuça -> dent |
||||
{"uça",4}, |
||||
# ricaço -> ric |
||||
{"aço",4,"",{"antebraço"}}, |
||||
{"aça",4}, |
||||
# cansadão -> cans |
||||
{"adão",4}, |
||||
{"idão",4}, |
||||
# corpázio -> corp |
||||
{"ázio",3,"",{"topázio"}}, |
||||
# pratarraz -> prat |
||||
{"arraz",4}, |
||||
{"zarrão",3}, |
||||
{"arrão",4}, |
||||
# bocarra -> boc |
||||
{"arra",3}, |
||||
# calorzão -> calor |
||||
{"zão",2,"",{"coalizão"}}, |
||||
# meninão -> menin |
||||
{"ão",3,"",{"camarão","chimarrão","canção","coração","embrião","grotão","glutão","ficção","fogão","feição","furacão","gamão","lampião","leão","macacão","nação","órfão","orgão","patrão","portão","quinhão","rincão","tração","falcão","espião","mamão","folião","cordão","aptidão","campeão","colchão","limão","leilão","melão","barão","milhão","bilhão","fusão","cristão","ilusão","capitão","estação","senão"}}}; |
||||
|
||||
# Step 5: Noun Suffix Reduction |
||||
{ "Noun", 0, 0, {}, |
||||
# existencialista -> exist |
||||
{"encialista",4}, |
||||
# minimalista -> minim |
||||
{"alista",5}, |
||||
# contagem -> cont |
||||
{"agem",3,"",{"coragem","chantagem","vantagem","carruagem"}}, |
||||
# gerenciamento -> gerenc |
||||
{"iamento",4}, |
||||
# monitoramento -> monitor |
||||
{"amento",3,"",{"firmamento","fundamento","departamento"}}, |
||||
# nascimento -> nasc |
||||
{"imento",3}, |
||||
{"mento",6,"",{"firmamento","elemento","complemento","instrumento","departamento"}}, |
||||
# comercializado -> comerci |
||||
{"alizado",4}, |
||||
# traumatizado -> traum |
||||
{"atizado",4}, |
||||
{"tizado",4,"",{"alfabetizado"}}, |
||||
# alfabetizado -> alfabet |
||||
{"izado",5,"",{"organizado","pulverizado"}}, |
||||
# associativo -> associ |
||||
{"ativo",4,"",{"pejorativo","relativo"}}, |
||||
# contraceptivo -> contracep |
||||
{"tivo",4,"",{"relativo"}}, |
||||
# esportivo -> esport |
||||
{"ivo",4,"",{"passivo","possessivo","pejorativo","positivo"}}, |
||||
# abalado -> abal |
||||
{"ado",2,"",{"grado"}}, |
||||
# impedido -> imped |
||||
{"ido",3,"",{"cândido","consolido","rápido","decido","tímido","duvido","marido"}}, |
||||
# ralador -> ral |
||||
{"ador",3}, |
||||
# entendedor -> entend |
||||
{"edor",3}, |
||||
# cumpridor -> cumpr |
||||
{"idor",4,"",{"ouvidor"}}, |
||||
{"dor",4,"",{"ouvidor"}}, |
||||
{"sor",4,"",{"assessor"}}, |
||||
{"atoria",5}, |
||||
{"tor",3,"",{"benfeitor","leitor","editor","pastor","produtor","promotor","consultor"}}, |
||||
{"or",2,"",{"motor","melhor","redor","rigor","sensor","tambor","tumor","assessor","benfeitor","pastor","terior","favor","autor"}}, |
||||
# comparabilidade -> compar |
||||
{"abilidade",5}, |
||||
# abolicionista -> abol |
||||
{"icionista",4}, |
||||
# intervencionista -> interven |
||||
{"cionista",5}, |
||||
{"ionista",5}, |
||||
{"ionar",5}, |
||||
# profissional -> profiss |
||||
{"ional",4}, |
||||
# referência -> refer |
||||
{"ência",3}, |
||||
# repugnância -> repugn |
||||
{"ância",4,"",{"ambulância"}}, |
||||
# abatedouro -> abat |
||||
{"edouro",3}, |
||||
# fofoqueiro -> fofoc |
||||
{"queiro",3,"c"}, |
||||
{"adeiro",4,"",{"desfiladeiro"}}, |
||||
# brasileiro -> brasil |
||||
{"eiro",3,"",{"desfiladeiro","pioneiro","mosteiro"}}, |
||||
{"uoso",3}, |
||||
# gostoso -> gost |
||||
{"oso",3,"",{"precioso"}}, |
||||
# comercializaç -> comerci |
||||
{"alizaç",5}, |
||||
{"atizaç",5}, |
||||
{"tizaç",5}, |
||||
{"izaç",5,"",{"organizaç"}}, |
||||
# alegaç -> aleg |
||||
{"aç",3,"",{"equaç","relaç"}}, |
||||
# aboliç -> abol |
||||
{"iç",3,"",{"eleiç"}}, |
||||
# anedotário -> anedot |
||||
{"ário",3,"",{"voluntário","salário","aniversário","diário","lionário","armário"}}, |
||||
{"atório",3}, |
||||
{"rio",5,"",{"voluntário","salário","aniversário","diário","compulsório","lionário","próprio","stério","armário"}}, |
||||
# ministério -> minist |
||||
{"ério",6}, |
||||
# chinês -> chin |
||||
{"ês",4}, |
||||
# beleza -> bel |
||||
{"eza",3}, |
||||
# rigidez -> rigid |
||||
{"ez",4}, |
||||
# parentesco -> parent |
||||
{"esco",4}, |
||||
# ocupante -> ocup |
||||
{"ante",2,"",{"gigante","elefante","adiante","possante","instante","restaurante"}}, |
||||
# bombástico -> bomb |
||||
{"ástico",4,"",{"eclesiástico"}}, |
||||
{"alístico",3}, |
||||
{"áutico",4}, |
||||
{"êutico",4}, |
||||
{"tico",3,"",{"político","eclesiástico","diagnostico","prático","doméstico","diagnóstico","idêntico","alopático","artístico","autêntico","eclético","crítico","critico"}}, |
||||
# polêmico -> polêm |
||||
{"ico",4,"",{"tico","público","explico"}}, |
||||
# produtividade -> produt |
||||
{"ividade",5}, |
||||
# profundidade -> profund |
||||
{"idade",4,"",{"autoridade","comunidade"}}, |
||||
# aposentadoria -> aposentad |
||||
{"oria",4,"",{"categoria"}}, |
||||
# existencial -> exist |
||||
{"encial",5}, |
||||
# artista -> art |
||||
{"ista",4}, |
||||
{"auta",5}, |
||||
# maluquice -> maluc |
||||
{"quice",4,"c"}, |
||||
# chatice -> chat |
||||
{"ice",4,"",{"cúmplice"}}, |
||||
# demoníaco -> demon |
||||
{"íaco",3}, |
||||
# decorrente -> decorr |
||||
{"ente",4,"",{"freqüente","alimente","acrescente","permanente","oriente","aparente"}}, |
||||
{"ense",5}, |
||||
# criminal -> crim |
||||
{"inal",3}, |
||||
# americano -> americ |
||||
{"ano",4}, |
||||
# amável -> am |
||||
{"ável",2,"",{"afável","razoável","potável","vulnerável"}}, |
||||
# combustível -> combust |
||||
{"ível",3,"",{"possível"}}, |
||||
{"vel",5,"",{"possível","vulnerável","solúvel"}}, |
||||
{"bil",3,"vel"}, |
||||
# cobertura -> cobert |
||||
{"ura",4,"",{"imatura","acupuntura","costura"}}, |
||||
{"ural",4}, |
||||
# consensual -> consens |
||||
{"ual",3,"",{"bissexual","virtual","visual","pontual"}}, |
||||
# mundial -> mund |
||||
{"ial",3}, |
||||
# experimental -> experiment |
||||
{"al",4,"",{"afinal","animal","estatal","bissexual","desleal","fiscal","formal","pessoal","liberal","postal","virtual","visual","pontual","sideral","sucursal"}}, |
||||
{"alismo",4}, |
||||
{"ivismo",4}, |
||||
{"ismo",3,"",{"cinismo"}}}; |
||||
|
||||
# Step 6: Verb Suffix Reduction |
||||
{ "Verb", 0, 0, {}, |
||||
# cantaríamo -> cant |
||||
{"aríamo",2}, |
||||
# cantássemo -> cant |
||||
{"ássemo",2}, |
||||
# beberíamo -> beb |
||||
{"eríamo",2}, |
||||
# bebêssemo -> beb |
||||
{"êssemo",2}, |
||||
# partiríamo -> part |
||||
{"iríamo",3}, |
||||
# partíssemo -> part |
||||
{"íssemo",3}, |
||||
# cantáramo -> cant |
||||
{"áramo",2}, |
||||
# cantárei -> cant |
||||
{"árei",2}, |
||||
# cantaremo -> cant |
||||
{"aremo",2}, |
||||
# cantariam -> cant |
||||
{"ariam",2}, |
||||
# cantaríei -> cant |
||||
{"aríei",2}, |
||||
# cantássei -> cant |
||||
{"ássei",2}, |
||||
# cantassem -> cant |
||||
{"assem",2}, |
||||
# cantávamo -> cant |
||||
{"ávamo",2}, |
||||
# bebêramo -> beb |
||||
{"êramo",3}, |
||||
# beberemo -> beb |
||||
{"eremo",3}, |
||||
# beberiam -> beb |
||||
{"eriam",3}, |
||||
# beberíei -> beb |
||||
{"eríei",3}, |
||||
# bebêssei -> beb |
||||
{"êssei",3}, |
||||
# bebessem -> beb |
||||
{"essem",3}, |
||||
# partíramo -> part |
||||
{"íramo",3}, |
||||
# partiremo -> part |
||||
{"iremo",3}, |
||||
# partiriam -> part |
||||
{"iriam",3}, |
||||
# partiríei -> part |
||||
{"iríei",3}, |
||||
# partíssei -> part |
||||
{"íssei",3}, |
||||
# partissem -> part |
||||
{"issem",3}, |
||||
# cantando -> cant |
||||
{"ando",2}, |
||||
# bebendo -> beb |
||||
{"endo",3}, |
||||
# partindo -> part |
||||
{"indo",3}, |
||||
# propondo -> prop |
||||
{"ondo",3}, |
||||
# cantaram -> cant |
||||
{"aram",2}, |
||||
{"arão",2}, |
||||
# cantarde -> cant |
||||
{"arde",2}, |
||||
# cantarei -> cant |
||||
{"arei",2}, |
||||
# cantarem -> cant |
||||
{"arem",2}, |
||||
# cantaria -> cant |
||||
{"aria",2}, |
||||
# cantarmo -> cant |
||||
{"armo",2}, |
||||
# cantasse -> cant |
||||
{"asse",2}, |
||||
# cantaste -> cant |
||||
{"aste",2}, |
||||
# cantavam -> cant |
||||
{"avam",2,"",{"agravam"}}, |
||||
# cantávei -> cant |
||||
{"ávei",2}, |
||||
# beberam -> beb |
||||
{"eram",3}, |
||||
{"erão",3}, |
||||
# beberde -> beb |
||||
{"erde",3}, |
||||
# beberei -> beb |
||||
{"erei",3}, |
||||
# bebêrei -> beb |
||||
{"êrei",3}, |
||||
# beberem -> beb |
||||
{"erem",3}, |
||||
# beberia -> beb |
||||
{"eria",3}, |
||||
# bebermo -> beb |
||||
{"ermo",3}, |
||||
# bebesse -> beb |
||||
{"esse",3}, |
||||
# bebeste -> beb |
||||
{"este",3,"",{"faroeste","agreste"}}, |
||||
# bebíamo -> beb |
||||
{"íamo",3}, |
||||
# partiram -> part |
||||
{"iram",3}, |
||||
# concluíram -> conclu |
||||
{"íram",3}, |
||||
{"irão",2}, |
||||
# partirde -> part |
||||
{"irde",2}, |
||||
# partirei -> part |
||||
{"irei",3,"",{"admirei"}}, |
||||
# partirem -> part |
||||
{"irem",3,"",{"adquirem"}}, |
||||
# partiria -> part |
||||
{"iria",3}, |
||||
# partirmo -> part |
||||
{"irmo",3}, |
||||
# partisse -> part |
||||
{"isse",3}, |
||||
# partiste -> part |
||||
{"iste",4}, |
||||
{"iava",4,"",{"ampliava"}}, |
||||
# cantamo -> cant |
||||
{"amo",2}, |
||||
{"iona",3}, |
||||
# cantara -> cant |
||||
{"ara",2,"",{"arara","prepara"}}, |
||||
# cantará -> cant |
||||
{"ará",2,"",{"alvará"}}, |
||||
# cantare -> cant |
||||
{"are",2,"",{"prepare"}}, |
||||
# cantava -> cant |
||||
{"ava",2,"",{"agrava"}}, |
||||
# cantemo -> cant |
||||
{"emo",2}, |
||||
# bebera -> beb |
||||
{"era",3,"",{"acelera","espera"}}, |
||||
# beberá -> beb |
||||
{"erá",3}, |
||||
# bebere -> beb |
||||
{"ere",3,"",{"espere"}}, |
||||
# bebiam -> beb |
||||
{"iam",3,"",{"enfiam","ampliam","elogiam","ensaiam"}}, |
||||
# bebíei -> beb |
||||
{"íei",3}, |
||||
# partimo -> part |
||||
{"imo",3,"",{"reprimo","intimo","íntimo","nimo","queimo","ximo"}}, |
||||
# partira -> part |
||||
{"ira",3,"",{"fronteira","sátira"}}, |
||||
{"ído",3}, |
||||
# partirá -> part |
||||
{"irá",3}, |
||||
{"tizar",4,"",{"alfabetizar"}}, |
||||
{"izar",5,"",{"organizar"}}, |
||||
{"itar",5,"",{"acreditar","explicitar","estreitar"}}, |
||||
# partire -> part |
||||
{"ire",3,"",{"adquire"}}, |
||||
# compomo -> comp |
||||
{"omo",3}, |
||||
# cantai -> cant |
||||
{"ai",2}, |
||||
# cantam -> cant |
||||
{"am",2}, |
||||
# barbear -> barb |
||||
{"ear",4,"",{"alardear","nuclear"}}, |
||||
# cantar -> cant |
||||
{"ar",2,"",{"azar","bazaar","patamar"}}, |
||||
# cheguei -> cheg |
||||
{"uei",3}, |
||||
{"uía",5,"u"}, |
||||
# cantei -> cant |
||||
{"ei",3}, |
||||
{"guem",3,"g"}, |
||||
# cantem -> cant |
||||
{"em",2,"",{"alem","virgem"}}, |
||||
# beber -> beb |
||||
{"er",2,"",{"éter","pier"}}, |
||||
# bebeu -> beb |
||||
{"eu",3,"",{"chapeu"}}, |
||||
# bebia -> beb |
||||
{"ia",3,"",{"estória","fatia","acia","praia","elogia","mania","lábia","aprecia","polícia","arredia","cheia","ásia"}}, |
||||
# partir -> part |
||||
{"ir",3,"",{"freir"}}, |
||||
# partiu -> part |
||||
{"iu",3}, |
||||
{"eou",5}, |
||||
# chegou -> cheg |
||||
{"ou",3}, |
||||
# bebi -> beb |
||||
{"i",3}}; |
||||
|
||||
# Step 7: Vowel Removal |
||||
{ "Vowel", 0, 0, {}, |
||||
{"bil",2,"vel"}, |
||||
{"gue",2,"g",{"gangue","jegue"}}, |
||||
{"á",3}, |
||||
{"ê",3,"",{"bebê"}}, |
||||
# menina -> menin |
||||
{"a",3,"",{"ásia"}}, |
||||
# grande -> grand |
||||
{"e",3}, |
||||
# menino -> menin |
||||
{"o",3,"",{"ão"}}}; |
@ -0,0 +1,233 @@
|
||||
# This file was created by Jacques Savoy and is distributed under the BSD license. |
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html. |
||||
# Also see http://www.opensource.org/licenses/bsd-license.html |
||||
acea |
||||
aceasta |
||||
această |
||||
aceea |
||||
acei |
||||
aceia |
||||
acel |
||||
acela |
||||
acele |
||||
acelea |
||||
acest |
||||
acesta |
||||
aceste |
||||
acestea |
||||
aceşti |
||||
aceştia |
||||
acolo |
||||
acum |
||||
ai |
||||
aia |
||||
aibă |
||||
aici |
||||
al |
||||
ăla |
||||
ale |
||||
alea |
||||
ălea |
||||
altceva |
||||
altcineva |
||||
am |
||||
ar |
||||
are |
||||
aş |
||||
aşadar |
||||
asemenea |
||||
asta |
||||
ăsta |
||||
astăzi |
||||
astea |
||||
ăstea |
||||
ăştia |
||||
asupra |
||||
aţi |
||||
au |
||||
avea |
||||
avem |
||||
aveţi |
||||
azi |
||||
bine |
||||
bucur |
||||
bună |
||||
ca |
||||
că |
||||
căci |
||||
când |
||||
care |
||||
cărei |
||||
căror |
||||
cărui |
||||
cât |
||||
câte |
||||
câţi |
||||
către |
||||
câtva |
||||
ce |
||||
cel |
||||
ceva |
||||
chiar |
||||
cînd |
||||
cine |
||||
cineva |
||||
cît |
||||
cîte |
||||
cîţi |
||||
cîtva |
||||
contra |
||||
cu |
||||
cum |
||||
cumva |
||||
curând |
||||
curînd |
||||
da |
||||
dă |
||||
dacă |
||||
dar |
||||
datorită |
||||
de |
||||
deci |
||||
deja |
||||
deoarece |
||||
departe |
||||
deşi |
||||
din |
||||
dinaintea |
||||
dintr |
||||
dintre |
||||
drept |
||||
după |
||||
ea |
||||
ei |
||||
el |
||||
ele |
||||
eram |
||||
este |
||||
eşti |
||||
eu |
||||
face |
||||
fără |
||||
fi |
||||
fie |
||||
fiecare |
||||
fii |
||||
fim |
||||
fiţi |
||||
iar |
||||
ieri |
||||
îi |
||||
îl |
||||
îmi |
||||
împotriva |
||||
în |
||||
înainte |
||||
înaintea |
||||
încât |
||||
încît |
||||
încotro |
||||
între |
||||
întrucât |
||||
întrucît |
||||
îţi |
||||
la |
||||
lângă |
||||
le |
||||
li |
||||
lîngă |
||||
lor |
||||
lui |
||||
mă |
||||
mâine |
||||
mea |
||||
mei |
||||
mele |
||||
mereu |
||||
meu |
||||
mi |
||||
mine |
||||
mult |
||||
multă |
||||
mulţi |
||||
ne |
||||
nicăieri |
||||
nici |
||||
nimeni |
||||
nişte |
||||
noastră |
||||
noastre |
||||
noi |
||||
noştri |
||||
nostru |
||||
nu |
||||
ori |
||||
oricând |
||||
oricare |
||||
oricât |
||||
orice |
||||
oricînd |
||||
oricine |
||||
oricît |
||||
oricum |
||||
oriunde |
||||
până |
||||
pe |
||||
pentru |
||||
peste |
||||
pînă |
||||
poate |
||||
pot |
||||
prea |
||||
prima |
||||
primul |
||||
prin |
||||
printr |
||||
sa |
||||
să |
||||
săi |
||||
sale |
||||
sau |
||||
său |
||||
se |
||||
şi |
||||
sînt |
||||
sîntem |
||||
sînteţi |
||||
spre |
||||
sub |
||||
sunt |
||||
suntem |
||||
sunteţi |
||||
ta |
||||
tăi |
||||
tale |
||||
tău |
||||
te |
||||
ţi |
||||
ţie |
||||
tine |
||||
toată |
||||
toate |
||||
tot |
||||
toţi |
||||
totuşi |
||||
tu |
||||
un |
||||
una |
||||
unde |
||||
undeva |
||||
unei |
||||
unele |
||||
uneori |
||||
unor |
||||
vă |
||||
vi |
||||
voastră |
||||
voastre |
||||
voi |
||||
voştri |
||||
vostru |
||||
vouă |
||||
vreo |
||||
vreun |
@ -0,0 +1,108 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A Danish stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from |
||||
| a large text sample. |
||||
|
||||
|
||||
og | and |
||||
i | in |
||||
jeg | I |
||||
det | that (dem. pronoun)/it (pers. pronoun) |
||||
at | that (in front of a sentence)/to (with infinitive) |
||||
en | a/an |
||||
den | it (pers. pronoun)/that (dem. pronoun) |
||||
til | to/at/for/until/against/by/of/into, more |
||||
er | present tense of "to be" |
||||
som | who, as |
||||
på | on/upon/in/on/at/to/after/of/with/for, on |
||||
de | they |
||||
med | with/by/in, along |
||||
han | he |
||||
af | of/by/from/off/for/in/with/on, off |
||||
for | at/for/to/from/by/of/ago, in front/before, because |
||||
ikke | not |
||||
der | who/which, there/those |
||||
var | past tense of "to be" |
||||
mig | me/myself |
||||
sig | oneself/himself/herself/itself/themselves |
||||
men | but |
||||
et | a/an/one, one (number), someone/somebody/one |
||||
har | present tense of "to have" |
||||
om | round/about/for/in/a, about/around/down, if |
||||
vi | we |
||||
min | my |
||||
havde | past tense of "to have" |
||||
ham | him |
||||
hun | she |
||||
nu | now |
||||
over | over/above/across/by/beyond/past/on/about, over/past |
||||
da | then, when/as/since |
||||
fra | from/off/since, off, since |
||||
du | you |
||||
ud | out |
||||
sin | his/her/its/one's |
||||
dem | them |
||||
os | us/ourselves |
||||
op | up |
||||
man | you/one |
||||
hans | his |
||||
hvor | where |
||||
eller | or |
||||
hvad | what |
||||
skal | must/shall etc. |
||||
selv | myself/yourself/herself/ourselves etc., even |
||||
her | here |
||||
alle | all/everyone/everybody etc. |
||||
vil | will (verb) |
||||
blev | past tense of "to stay/to remain/to get/to become" |
||||
kunne | could |
||||
ind | in |
||||
når | when |
||||
være | present tense of "to be" |
||||
dog | however/yet/after all |
||||
noget | something |
||||
ville | would |
||||
jo | you know/you see (adv), yes |
||||
deres | their/theirs |
||||
efter | after/behind/according to/for/by/from, later/afterwards |
||||
ned | down |
||||
skulle | should |
||||
denne | this |
||||
end | than |
||||
dette | this |
||||
mit | my/mine |
||||
også | also |
||||
under | under/beneath/below/during, below/underneath |
||||
have | have |
||||
dig | you |
||||
anden | other |
||||
hende | her |
||||
mine | my |
||||
alt | everything |
||||
meget | much/very, plenty of |
||||
sit | his, her, its, one's |
||||
sine | his, her, its, one's |
||||
vor | our |
||||
mod | against |
||||
disse | these |
||||
hvis | if |
||||
din | your/yours |
||||
nogle | some |
||||
hos | by/at |
||||
blive | be/become |
||||
mange | many |
||||
ad | by/through |
||||
bliver | present tense of "to be/to become" |
||||
hendes | her/hers |
||||
været | be |
||||
thi | for (conj) |
||||
jer | you |
||||
sådan | such, like this/like that |
@ -0,0 +1,117 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A Dutch stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from |
||||
| a large sample of Dutch text. |
||||
|
||||
| Dutch stop words frequently exhibit homonym clashes. These are indicated |
||||
| clearly below. |
||||
|
||||
de | the |
||||
en | and |
||||
van | of, from |
||||
ik | I, the ego |
||||
te | (1) chez, at etc, (2) to, (3) too |
||||
dat | that, which |
||||
die | that, those, who, which |
||||
in | in, inside |
||||
een | a, an, one |
||||
hij | he |
||||
het | the, it |
||||
niet | not, nothing, naught |
||||
zijn | (1) to be, being, (2) his, one's, its |
||||
is | is |
||||
was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river |
||||
op | on, upon, at, in, up, used up |
||||
aan | on, upon, to (as dative) |
||||
met | with, by |
||||
als | like, such as, when |
||||
voor | (1) before, in front of, (2) furrow |
||||
had | had, past tense all persons sing. of 'hebben' (have) |
||||
er | there |
||||
maar | but, only |
||||
om | round, about, for etc |
||||
hem | him |
||||
dan | then |
||||
zou | should/would, past tense all persons sing. of 'zullen' |
||||
of | or, whether, if |
||||
wat | what, something, anything |
||||
mijn | possessive and noun 'mine' |
||||
men | people, 'one' |
||||
dit | this |
||||
zo | so, thus, in this way |
||||
door | through by |
||||
over | over, across |
||||
ze | she, her, they, them |
||||
zich | oneself |
||||
bij | (1) a bee, (2) by, near, at |
||||
ook | also, too |
||||
tot | till, until |
||||
je | you |
||||
mij | me |
||||
uit | out of, from |
||||
der | Old Dutch form of 'van der' still found in surnames |
||||
daar | (1) there, (2) because |
||||
haar | (1) her, their, them, (2) hair |
||||
naar | (1) unpleasant, unwell etc, (2) towards, (3) as |
||||
heb | present first person sing. of 'to have' |
||||
hoe | how, why |
||||
heeft | present third person sing. of 'to have' |
||||
hebben | 'to have' and various parts thereof |
||||
deze | this |
||||
u | you |
||||
want | (1) for, (2) mitten, (3) rigging |
||||
nog | yet, still |
||||
zal | 'shall', first and third person sing. of verb 'zullen' (will) |
||||
me | me |
||||
zij | she, they |
||||
nu | now |
||||
ge | 'thou', still used in Belgium and south Netherlands |
||||
geen | none |
||||
omdat | because |
||||
iets | something, somewhat |
||||
worden | to become, grow, get |
||||
toch | yet, still |
||||
al | all, every, each |
||||
waren | (1) 'were' (2) to wander, (3) wares |
||||
veel | much, many |
||||
meer | (1) more, (2) lake |
||||
doen | to do, to make |
||||
toen | then, when |
||||
moet | noun 'spot/mote' and present form of 'to must' |
||||
ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' |
||||
zonder | without |
||||
kan | noun 'can' and present form of 'to be able' |
||||
hun | their, them |
||||
dus | so, consequently |
||||
alles | all, everything, anything |
||||
onder | under, beneath |
||||
ja | yes, of course |
||||
eens | once, one day |
||||
hier | here |
||||
wie | who |
||||
werd | imperfect third person sing. of 'become' |
||||
altijd | always |
||||
doch | yet, but etc |
||||
wordt | present third person sing. of 'become' |
||||
wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans |
||||
kunnen | to be able |
||||
ons | us/our |
||||
zelf | self |
||||
tegen | against, towards, at |
||||
na | after, near |
||||
reeds | already |
||||
wil | (1) present tense of 'want', (2) 'will', noun, (3) fender |
||||
kon | could; past tense of 'to be able' |
||||
niets | nothing |
||||
uw | your |
||||
iemand | somebody |
||||
geweest | been; past participle of 'be' |
||||
andere | other |
@ -0,0 +1,317 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| An English stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
| Many of the forms below are quite rare (e.g. "yourselves") but included for |
||||
| completeness. |
||||
|
||||
| PRONOUNS FORMS |
||||
| 1st person sing |
||||
|
||||
i | subject, always in upper case of course |
||||
|
||||
me | object |
||||
my | possessive adjective |
||||
| the possessive pronoun `mine' is best suppressed, because of the |
||||
| sense of coal-mine etc. |
||||
myself | reflexive |
||||
| 1st person plural |
||||
we | subject |
||||
|
||||
| us | object |
||||
| care is required here because US = United States. It is usually |
||||
| safe to remove it if it is in lower case. |
||||
our | possessive adjective |
||||
ours | possessive pronoun |
||||
ourselves | reflexive |
||||
| second person (archaic `thou' forms not included) |
||||
you | subject and object |
||||
your | possessive adjective |
||||
yours | possessive pronoun |
||||
yourself | reflexive (singular) |
||||
yourselves | reflexive (plural) |
||||
| third person singular |
||||
he | subject |
||||
him | object |
||||
his | possessive adjective and pronoun |
||||
himself | reflexive |
||||
|
||||
she | subject |
||||
her | object and possessive adjective |
||||
hers | possessive pronoun |
||||
herself | reflexive |
||||
|
||||
it | subject and object |
||||
its | possessive adjective |
||||
itself | reflexive |
||||
| third person plural |
||||
they | subject |
||||
them | object |
||||
their | possessive adjective |
||||
theirs | possessive pronoun |
||||
themselves | reflexive |
||||
| other forms (demonstratives, interrogatives) |
||||
what |
||||
which |
||||
who |
||||
whom |
||||
this |
||||
that |
||||
these |
||||
those |
||||
|
||||
| VERB FORMS (using F.R. Palmer's nomenclature) |
||||
| BE |
||||
am | 1st person, present |
||||
is | -s form (3rd person, present) |
||||
are | present |
||||
was | 1st person, past |
||||
were | past |
||||
be | infinitive |
||||
been | past participle |
||||
being | -ing form |
||||
| HAVE |
||||
have | simple |
||||
has | -s form |
||||
had | past |
||||
having | -ing form |
||||
| DO |
||||
do | simple |
||||
does | -s form |
||||
did | past |
||||
doing | -ing form |
||||
|
||||
| The forms below are, I believe, best omitted, because of the significant |
||||
| homonym forms: |
||||
|
||||
| He made a WILL |
||||
| old tin CAN |
||||
| merry month of MAY |
||||
| a smell of MUST |
||||
| fight the good fight with all thy MIGHT |
||||
|
||||
| would, could, should, ought might however be included |
||||
|
||||
| | AUXILIARIES |
||||
| | WILL |
||||
|will |
||||
|
||||
would |
||||
|
||||
| | SHALL |
||||
|shall |
||||
|
||||
should |
||||
|
||||
| | CAN |
||||
|can |
||||
|
||||
could |
||||
|
||||
| | MAY |
||||
|may |
||||
|might |
||||
| | MUST |
||||
|must |
||||
| | OUGHT |
||||
|
||||
ought |
||||
|
||||
| COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing |
||||
| pronoun + verb |
||||
|
||||
i'm |
||||
you're |
||||
he's |
||||
she's |
||||
it's |
||||
we're |
||||
they're |
||||
i've |
||||
you've |
||||
we've |
||||
they've |
||||
i'd |
||||
you'd |
||||
he'd |
||||
she'd |
||||
we'd |
||||
they'd |
||||
i'll |
||||
you'll |
||||
he'll |
||||
she'll |
||||
we'll |
||||
they'll |
||||
|
||||
| verb + negation |
||||
|
||||
isn't |
||||
aren't |
||||
wasn't |
||||
weren't |
||||
hasn't |
||||
haven't |
||||
hadn't |
||||
doesn't |
||||
don't |
||||
didn't |
||||
|
||||
| auxiliary + negation |
||||
|
||||
won't |
||||
wouldn't |
||||
shan't |
||||
shouldn't |
||||
can't |
||||
cannot |
||||
couldn't |
||||
mustn't |
||||
|
||||
| miscellaneous forms |
||||
|
||||
let's |
||||
that's |
||||
who's |
||||
what's |
||||
here's |
||||
there's |
||||
when's |
||||
where's |
||||
why's |
||||
how's |
||||
|
||||
| rarer forms |
||||
|
||||
| daren't needn't |
||||
|
||||
| doubtful forms |
||||
|
||||
| oughtn't mightn't |
||||
|
||||
| ARTICLES |
||||
a |
||||
an |
||||
the |
||||
|
||||
| THE REST (Overlap among prepositions, conjunctions, adverbs etc is so |
||||
| high, that classification is pointless.) |
||||
and |
||||
but |
||||
if |
||||
or |
||||
because |
||||
as |
||||
until |
||||
while |
||||
|
||||
of |
||||
at |
||||
by |
||||
for |
||||
with |
||||
about |
||||
against |
||||
between |
||||
into |
||||
through |
||||
during |
||||
before |
||||
after |
||||
above |
||||
below |
||||
to |
||||
from |
||||
up |
||||
down |
||||
in |
||||
out |
||||
on |
||||
off |
||||
over |
||||
under |
||||
|
||||
again |
||||
further |
||||
then |
||||
once |
||||
|
||||
here |
||||
there |
||||
when |
||||
where |
||||
why |
||||
how |
||||
|
||||
all |
||||
any |
||||
both |
||||
each |
||||
few |
||||
more |
||||
most |
||||
other |
||||
some |
||||
such |
||||
|
||||
no |
||||
nor |
||||
not |
||||
only |
||||
own |
||||
same |
||||
so |
||||
than |
||||
too |
||||
very |
||||
|
||||
| Just for the record, the following words are among the commonest in English |
||||
|
||||
| one |
||||
| every |
||||
| least |
||||
| less |
||||
| many |
||||
| now |
||||
| ever |
||||
| never |
||||
| say |
||||
| says |
||||
| said |
||||
| also |
||||
| get |
||||
| go |
||||
| goes |
||||
| just |
||||
| made |
||||
| make |
||||
| put |
||||
| see |
||||
| seen |
||||
| whether |
||||
| like |
||||
| well |
||||
| back |
||||
| even |
||||
| still |
||||
| way |
||||
| take |
||||
| since |
||||
| another |
||||
| however |
||||
| two |
||||
| three |
||||
| four |
||||
| five |
||||
| first |
||||
| second |
||||
| new |
||||
| old |
||||
| high |
||||
| long |
@ -0,0 +1,95 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| forms of BE |
||||
|
||||
olla |
||||
olen |
||||
olet |
||||
on |
||||
olemme |
||||
olette |
||||
ovat |
||||
ole | negative form |
||||
|
||||
oli |
||||
olisi |
||||
olisit |
||||
olisin |
||||
olisimme |
||||
olisitte |
||||
olisivat |
||||
olit |
||||
olin |
||||
olimme |
||||
olitte |
||||
olivat |
||||
ollut |
||||
olleet |
||||
|
||||
en | negation |
||||
et |
||||
ei |
||||
emme |
||||
ette |
||||
eivät |
||||
|
||||
|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans |
||||
minä minun minut minua minussa minusta minuun minulla minulta minulle | I |
||||
sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you |
||||
hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she |
||||
me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we |
||||
te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you |
||||
he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they |
||||
|
||||
tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this |
||||
tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that |
||||
se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it |
||||
nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these |
||||
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those |
||||
ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they |
||||
|
||||
kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who |
||||
ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) |
||||
mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what |
||||
mitkä | (pl) |
||||
|
||||
joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which |
||||
jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) |
||||
|
||||
| conjunctions |
||||
|
||||
että | that |
||||
ja | and |
||||
jos | if |
||||
koska | because |
||||
kuin | than |
||||
mutta | but |
||||
niin | so |
||||
sekä | and |
||||
sillä | for |
||||
tai | or |
||||
vaan | but |
||||
vai | or |
||||
vaikka | although |
||||
|
||||
|
||||
| prepositions |
||||
|
||||
kanssa | with |
||||
mukaan | according to |
||||
noin | about |
||||
poikki | across |
||||
yli | over, across |
||||
|
||||
| other |
||||
|
||||
kun | when |
||||
niin | so |
||||
nyt | now |
||||
itse | self |
||||
|
@ -0,0 +1,292 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A German stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
| The number of forms in this list is reduced significantly by passing it |
||||
| through the German stemmer. |
||||
|
||||
|
||||
aber | but |
||||
|
||||
alle | all |
||||
allem |
||||
allen |
||||
aller |
||||
alles |
||||
|
||||
als | than, as |
||||
also | so |
||||
am | an + dem |
||||
an | at |
||||
|
||||
ander | other |
||||
andere |
||||
anderem |
||||
anderen |
||||
anderer |
||||
anderes |
||||
anderm |
||||
andern |
||||
anderr |
||||
anders |
||||
|
||||
auch | also |
||||
auf | on |
||||
aus | out of |
||||
bei | by |
||||
bin | am |
||||
bis | until |
||||
bist | art |
||||
da | there |
||||
damit | with it |
||||
dann | then |
||||
|
||||
der | the |
||||
den |
||||
des |
||||
dem |
||||
die |
||||
das |
||||
|
||||
daß | that |
||||
|
||||
derselbe | the same |
||||
derselben |
||||
denselben |
||||
desselben |
||||
demselben |
||||
dieselbe |
||||
dieselben |
||||
dasselbe |
||||
|
||||
dazu | to that |
||||
|
||||
dein | thy |
||||
deine |
||||
deinem |
||||
deinen |
||||
deiner |
||||
deines |
||||
|
||||
denn | because |
||||
|
||||
derer | of those |
||||
dessen | of him |
||||
|
||||
dich | thee |
||||
dir | to thee |
||||
du | thou |
||||
|
||||
dies | this |
||||
diese |
||||
diesem |
||||
diesen |
||||
dieser |
||||
dieses |
||||
|
||||
|
||||
doch | (several meanings) |
||||
dort | (over) there |
||||
|
||||
|
||||
durch | through |
||||
|
||||
ein | a |
||||
eine |
||||
einem |
||||
einen |
||||
einer |
||||
eines |
||||
|
||||
einig | some |
||||
einige |
||||
einigem |
||||
einigen |
||||
einiger |
||||
einiges |
||||
|
||||
einmal | once |
||||
|
||||
er | he |
||||
ihn | him |
||||
ihm | to him |
||||
|
||||
es | it |
||||
etwas | something |
||||
|
||||
euer | your |
||||
eure |
||||
eurem |
||||
euren |
||||
eurer |
||||
eures |
||||
|
||||
für | for |
||||
gegen | towards |
||||
gewesen | p.p. of sein |
||||
hab | have |
||||
habe | have |
||||
haben | have |
||||
hat | has |
||||
hatte | had |
||||
hatten | had |
||||
hier | here |
||||
hin | there |
||||
hinter | behind |
||||
|
||||
ich | I |
||||
mich | me |
||||
mir | to me |
||||
|
||||
|
||||
ihr | you, to her |
||||
ihre |
||||
ihrem |
||||
ihren |
||||
ihrer |
||||
ihres |
||||
euch | to you |
||||
|
||||
im | in + dem |
||||
in | in |
||||
indem | while |
||||
ins | in + das |
||||
ist | is |
||||
|
||||
jede | each, every |
||||
jedem |
||||
jeden |
||||
jeder |
||||
jedes |
||||
|
||||
jene | that |
||||
jenem |
||||
jenen |
||||
jener |
||||
jenes |
||||
|
||||
jetzt | now |
||||
kann | can |
||||
|
||||
kein | no |
||||
keine |
||||
keinem |
||||
keinen |
||||
keiner |
||||
keines |
||||
|
||||
können | can |
||||
könnte | could |
||||
machen | do |
||||
man | one |
||||
|
||||
manche | some, many a |
||||
manchem |
||||
manchen |
||||
mancher |
||||
manches |
||||
|
||||
mein | my |
||||
meine |
||||
meinem |
||||
meinen |
||||
meiner |
||||
meines |
||||
|
||||
mit | with |
||||
muss | must |
||||
musste | had to |
||||
nach | to(wards) |
||||
nicht | not |
||||
nichts | nothing |
||||
noch | still, yet |
||||
nun | now |
||||
nur | only |
||||
ob | whether |
||||
oder | or |
||||
ohne | without |
||||
sehr | very |
||||
|
||||
sein | his |
||||
seine |
||||
seinem |
||||
seinen |
||||
seiner |
||||
seines |
||||
|
||||
selbst | self |
||||
sich | herself |
||||
|
||||
sie | they, she |
||||
ihnen | to them |
||||
|
||||
sind | are |
||||
so | so |
||||
|
||||
solche | such |
||||
solchem |
||||
solchen |
||||
solcher |
||||
solches |
||||
|
||||
soll | shall |
||||
sollte | should |
||||
sondern | but |
||||
sonst | else |
||||
über | over |
||||
um | about, around |
||||
und | and |
||||
|
||||
uns | us |
||||
unse |
||||
unsem |
||||
unsen |
||||
unser |
||||
unses |
||||
|
||||
unter | under |
||||
viel | much |
||||
vom | von + dem |
||||
von | from |
||||
vor | before |
||||
während | while |
||||
war | was |
||||
waren | were |
||||
warst | wast |
||||
was | what |
||||
weg | away, off |
||||
weil | because |
||||
weiter | further |
||||
|
||||
welche | which |
||||
welchem |
||||
welchen |
||||
welcher |
||||
welches |
||||
|
||||
wenn | when |
||||
werde | will |
||||
werden | will |
||||
wie | how |
||||
wieder | again |
||||
will | want |
||||
wir | we |
||||
wird | will |
||||
wirst | wilt |
||||
wo | where |
||||
wollen | want |
||||
wollte | wanted |
||||
würde | would |
||||
würden | would |
||||
zu | to |
||||
zum | zu + dem |
||||
zur | zu + der |
||||
zwar | indeed |
||||
zwischen | between |
||||
|
@ -0,0 +1,209 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| Hungarian stop word list |
||||
| prepared by Anna Tordai |
||||
|
||||
a |
||||
ahogy |
||||
ahol |
||||
aki |
||||
akik |
||||
akkor |
||||
alatt |
||||
által |
||||
általában |
||||
amely |
||||
amelyek |
||||
amelyekben |
||||
amelyeket |
||||
amelyet |
||||
amelynek |
||||
ami |
||||
amit |
||||
amolyan |
||||
amíg |
||||
amikor |
||||
át |
||||
abban |
||||
ahhoz |
||||
annak |
||||
arra |
||||
arról |
||||
az |
||||
azok |
||||
azon |
||||
azt |
||||
azzal |
||||
azért |
||||
aztán |
||||
azután |
||||
azonban |
||||
bár |
||||
be |
||||
belül |
||||
benne |
||||
cikk |
||||
cikkek |
||||
cikkeket |
||||
csak |
||||
de |
||||
e |
||||
eddig |
||||
egész |
||||
egy |
||||
egyes |
||||
egyetlen |
||||
egyéb |
||||
egyik |
||||
egyre |
||||
ekkor |
||||
el |
||||
elég |
||||
ellen |
||||
elő |
||||
először |
||||
előtt |
||||
első |
||||
én |
||||
éppen |
||||
ebben |
||||
ehhez |
||||
emilyen |
||||
ennek |
||||
erre |
||||
ez |
||||
ezt |
||||
ezek |
||||
ezen |
||||
ezzel |
||||
ezért |
||||
és |
||||
fel |
||||
felé |
||||
hanem |
||||
hiszen |
||||
hogy |
||||
hogyan |
||||
igen |
||||
így |
||||
illetve |
||||
ill. |
||||
ill |
||||
ilyen |
||||
ilyenkor |
||||
ison |
||||
ismét |
||||
itt |
||||
jó |
||||
jól |
||||
jobban |
||||
kell |
||||
kellett |
||||
keresztül |
||||
keressünk |
||||
ki |
||||
kívül |
||||
között |
||||
közül |
||||
legalább |
||||
lehet |
||||
lehetett |
||||
legyen |
||||
lenne |
||||
lenni |
||||
lesz |
||||
lett |
||||
maga |
||||
magát |
||||
majd |
||||
már |
||||
más |
||||
másik |
||||
meg |
||||
még |
||||
mellett |
||||
mert |
||||
mely |
||||
melyek |
||||
mi |
||||
mit |
||||
míg |
||||
miért |
||||
milyen |
||||
mikor |
||||
minden |
||||
mindent |
||||
mindenki |
||||
mindig |
||||
mint |
||||
mintha |
||||
mivel |
||||
most |
||||
nagy |
||||
nagyobb |
||||
nagyon |
||||
ne |
||||
néha |
||||
nekem |
||||
neki |
||||
nem |
||||
néhány |
||||
nélkül |
||||
nincs |
||||
olyan |
||||
ott |
||||
össze |
||||
ő |
||||
ők |
||||
őket |
||||
pedig |
||||
persze |
||||
rá |
||||
s |
||||
saját |
||||
sem |
||||
semmi |
||||
sok |
||||
sokat |
||||
sokkal |
||||
számára |
||||
szemben |
||||
szerint |
||||
szinte |
||||
talán |
||||
tehát |
||||
teljes |
||||
tovább |
||||
továbbá |
||||
több |
||||
úgy |
||||
ugyanis |
||||
új |
||||
újabb |
||||
újra |
||||
után |
||||
utána |
||||
utolsó |
||||
vagy |
||||
vagyis |
||||
valaki |
||||
valami |
||||
valamint |
||||
való |
||||
vagyok |
||||
van |
||||
vannak |
||||
volt |
||||
voltam |
||||
voltak |
||||
voltunk |
||||
vissza |
||||
vele |
||||
viszont |
||||
volna |
@ -0,0 +1,301 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| An Italian stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
ad | a (to) before vowel |
||||
al | a + il |
||||
allo | a + lo |
||||
ai | a + i |
||||
agli | a + gli |
||||
all | a + l' |
||||
agl | a + gl' |
||||
alla | a + la |
||||
alle | a + le |
||||
con | with |
||||
col | con + il |
||||
coi | con + i (forms collo, cogli etc are now very rare) |
||||
da | from |
||||
dal | da + il |
||||
dallo | da + lo |
||||
dai | da + i |
||||
dagli | da + gli |
||||
dall | da + l' |
||||
dagl | da + gl' |
||||
dalla | da + la |
||||
dalle | da + le |
||||
di | of |
||||
del | di + il |
||||
dello | di + lo |
||||
dei | di + i |
||||
degli | di + gli |
||||
dell | di + l' |
||||
degl | di + gl' |
||||
della | di + la |
||||
delle | di + le |
||||
in | in |
||||
nel | in + il |
||||
nello | in + lo |
||||
nei | in + i |
||||
negli | in + gli |
||||
nell | in + l' |
||||
negl | in + gl' |
||||
nella | in + la |
||||
nelle | in + le |
||||
su | on |
||||
sul | su + il |
||||
sullo | su + lo |
||||
sui | su + i |
||||
sugli | su + gli |
||||
sull | su + l' |
||||
sugl | su + gl' |
||||
sulla | su + la |
||||
sulle | su + le |
||||
per | through, by |
||||
tra | among |
||||
contro | against |
||||
io | I |
||||
tu | thou |
||||
lui | he |
||||
lei | she |
||||
noi | we |
||||
voi | you |
||||
loro | they |
||||
mio | my |
||||
mia | |
||||
miei | |
||||
mie | |
||||
tuo | |
||||
tua | |
||||
tuoi | thy |
||||
tue | |
||||
suo | |
||||
sua | |
||||
suoi | his, her |
||||
sue | |
||||
nostro | our |
||||
nostra | |
||||
nostri | |
||||
nostre | |
||||
vostro | your |
||||
vostra | |
||||
vostri | |
||||
vostre | |
||||
mi | me |
||||
ti | thee |
||||
ci | us, there |
||||
vi | you, there |
||||
lo | him, the |
||||
la | her, the |
||||
li | them |
||||
le | them, the |
||||
gli | to him, the |
||||
ne | from there etc |
||||
il | the |
||||
un | a |
||||
uno | a |
||||
una | a |
||||
ma | but |
||||
ed | and |
||||
se | if |
||||
perché | why, because |
||||
anche | also |
||||
come | how |
||||
dov | where (as dov') |
||||
dove | where |
||||
che | who, that |
||||
chi | who |
||||
cui | whom |
||||
non | not |
||||
più | more |
||||
quale | who, that |
||||
quanto | how much |
||||
quanti | |
||||
quanta | |
||||
quante | |
||||
quello | that |
||||
quelli | |
||||
quella | |
||||
quelle | |
||||
questo | this |
||||
questi | |
||||
questa | |
||||
queste | |
||||
si | yes |
||||
tutto | all |
||||
tutti | all |
||||
|
||||
| single letter forms: |
||||
|
||||
a | at |
||||
c | as c' for ce or ci |
||||
e | and |
||||
i | the |
||||
l | as l' |
||||
o | or |
||||
|
||||
| forms of avere, to have (not including the infinitive): |
||||
|
||||
ho |
||||
hai |
||||
ha |
||||
abbiamo |
||||
avete |
||||
hanno |
||||
abbia |
||||
abbiate |
||||
abbiano |
||||
avrò |
||||
avrai |
||||
avrà |
||||
avremo |
||||
avrete |
||||
avranno |
||||
avrei |
||||
avresti |
||||
avrebbe |
||||
avremmo |
||||
avreste |
||||
avrebbero |
||||
avevo |
||||
avevi |
||||
aveva |
||||
avevamo |
||||
avevate |
||||
avevano |
||||
ebbi |
||||
avesti |
||||
ebbe |
||||
avemmo |
||||
aveste |
||||
ebbero |
||||
avessi |
||||
avesse |
||||
avessimo |
||||
avessero |
||||
avendo |
||||
avuto |
||||
avuta |
||||
avuti |
||||
avute |
||||
|
||||
| forms of essere, to be (not including the infinitive): |
||||
sono |
||||
sei |
||||
è |
||||
siamo |
||||
siete |
||||
sia |
||||
siate |
||||
siano |
||||
sarò |
||||
sarai |
||||
sarà |
||||
saremo |
||||
sarete |
||||
saranno |
||||
sarei |
||||
saresti |
||||
sarebbe |
||||
saremmo |
||||
sareste |
||||
sarebbero |
||||
ero |
||||
eri |
||||
era |
||||
eravamo |
||||
eravate |
||||
erano |
||||
fui |
||||
fosti |
||||
fu |
||||
fummo |
||||
foste |
||||
furono |
||||
fossi |
||||
fosse |
||||
fossimo |
||||
fossero |
||||
essendo |
||||
|
||||
| forms of fare, to do (not including the infinitive, fa, fat-): |
||||
faccio |
||||
fai |
||||
facciamo |
||||
fanno |
||||
faccia |
||||
facciate |
||||
facciano |
||||
farò |
||||
farai |
||||
farà |
||||
faremo |
||||
farete |
||||
faranno |
||||
farei |
||||
faresti |
||||
farebbe |
||||
faremmo |
||||
fareste |
||||
farebbero |
||||
facevo |
||||
facevi |
||||
faceva |
||||
facevamo |
||||
facevate |
||||
facevano |
||||
feci |
||||
facesti |
||||
fece |
||||
facemmo |
||||
faceste |
||||
fecero |
||||
facessi |
||||
facesse |
||||
facessimo |
||||
facessero |
||||
facendo |
||||
|
||||
| forms of stare, to be (not including the infinitive): |
||||
sto |
||||
stai |
||||
sta |
||||
stiamo |
||||
stanno |
||||
stia |
||||
stiate |
||||
stiano |
||||
starò |
||||
starai |
||||
starà |
||||
staremo |
||||
starete |
||||
staranno |
||||
starei |
||||
staresti |
||||
starebbe |
||||
staremmo |
||||
stareste |
||||
starebbero |
||||
stavo |
||||
stavi |
||||
stava |
||||
stavamo |
||||
stavate |
||||
stavano |
||||
stetti |
||||
stesti |
||||
stette |
||||
stemmo |
||||
steste |
||||
stettero |
||||
stessi |
||||
stesse |
||||
stessimo |
||||
stessero |
||||
stando |
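
| All the Snowball lists in this change share one format: a vertical bar starts a comment, and each stop word sits at the start of a line. A minimal sketch of loading such a file with Lucene's WordlistLoader follows; the repackaged class paths and the Version constant are assumptions based on stock Lucene 4.x, where getSnowballWordSet(Reader, Version) has this shape.

    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.Reader;

    import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
    import com.fr.third.org.apache.lucene.analysis.util.WordlistLoader;
    import com.fr.third.org.apache.lucene.util.Version;

    public final class SnowballStopwords {
        /** Parses a Snowball-format stopword file: '|' comments, words at line start. */
        public static CharArraySet load(InputStream in) throws IOException {
            Reader reader = new InputStreamReader(in, "UTF-8");
            try {
                // Version.LUCENE_40 is an assumption; use whatever constant this fork exposes.
                return WordlistLoader.getSnowballWordSet(reader, Version.LUCENE_40);
            } finally {
                reader.close();
            }
        }
    }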
@ -0,0 +1,192 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A Norwegian stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
| This stop word list is for the dominant bokmål dialect. Words unique |
||||
| to nynorsk are marked *. |
||||
|
||||
| Revised by Jan Bruusgaard <Jan.Bruusgaard@ssb.no>, Jan 2005 |
||||
|
||||
og | and |
||||
i | in |
||||
jeg | I |
||||
det | it/this/that |
||||
at | to (w. inf.) |
||||
en | a/an |
||||
et | a/an |
||||
den | it/this/that |
||||
til | to |
||||
er | is/am/are |
||||
som | who/that |
||||
på | on |
||||
de | they / you(formal) |
||||
med | with |
||||
han | he |
||||
av | of |
||||
ikke | not |
||||
ikkje | not * |
||||
der | there |
||||
så | so |
||||
var | was/were |
||||
meg | me |
||||
seg | oneself (reflexive) |
||||
men | but |
||||
ett | one |
||||
har | have |
||||
om | about |
||||
vi | we |
||||
min | my |
||||
mitt | my |
||||
ha | have |
||||
hadde | had |
||||
hun | she |
||||
nå | now |
||||
over | over |
||||
da | when/as |
||||
ved | by/know |
||||
fra | from |
||||
du | you |
||||
ut | out |
||||
sin | your |
||||
dem | them |
||||
oss | us |
||||
opp | up |
||||
man | you/one |
||||
kan | can |
||||
hans | his |
||||
hvor | where |
||||
eller | or |
||||
hva | what |
||||
skal | shall/must |
||||
selv | self (reflexive) |
||||
sjøl | self (reflexive) |
||||
her | here |
||||
alle | all |
||||
vil | will |
||||
bli | become |
||||
ble | became |
||||
blei | became * |
||||
blitt | have become |
||||
kunne | could |
||||
inn | in |
||||
når | when |
||||
være | be |
||||
kom | come |
||||
noen | some |
||||
noe | some |
||||
ville | would |
||||
dere | you |
||||
som | who/which/that |
||||
deres | their/theirs |
||||
kun | only/just |
||||
ja | yes |
||||
etter | after |
||||
ned | down |
||||
skulle | should |
||||
denne | this |
||||
for | for/because |
||||
deg | you |
||||
si | hers/his |
||||
sine | hers/his |
||||
sitt | hers/his |
||||
mot | against |
||||
å | to |
||||
meget | much |
||||
hvorfor | why |
||||
dette | this |
||||
disse | these/those |
||||
uten | without |
||||
hvordan | how |
||||
ingen | none |
||||
din | your |
||||
ditt | your |
||||
blir | become |
||||
samme | same |
||||
hvilken | which |
||||
hvilke | which (plural) |
||||
sånn | such a |
||||
inni | inside/within |
||||
mellom | between |
||||
vår | our |
||||
hver | each |
||||
hvem | who |
||||
vors | us/ours |
||||
hvis | whose |
||||
både | both |
||||
bare | only/just |
||||
enn | than |
||||
fordi | as/because |
||||
før | before |
||||
mange | many |
||||
også | also |
||||
slik | just |
||||
vært | been |
||||
være | to be |
||||
båe | both * |
||||
begge | both |
||||
siden | since |
||||
dykk | your * |
||||
dykkar | yours * |
||||
dei | they * |
||||
deira | them * |
||||
deires | theirs * |
||||
deim | them * |
||||
di | your (fem.) * |
||||
då | as/when * |
||||
eg | I * |
||||
ein | a/an * |
||||
eit | a/an * |
||||
eitt | a/an * |
||||
elles | else * |
||||
honom | he * |
||||
hjå | at * |
||||
ho | she * |
||||
hoe | she * |
||||
henne | her |
||||
hennar | her/hers |
||||
hennes | hers |
||||
hoss | how * |
||||
hossen | how * |
||||
ikkje | not * |
||||
ingi | noone * |
||||
inkje | noone * |
||||
korleis | how * |
||||
korso | how * |
||||
kva | what/which * |
||||
kvar | where * |
||||
kvarhelst | where * |
||||
kven | who/whom * |
||||
kvi | why * |
||||
kvifor | why * |
||||
me | we * |
||||
medan | while * |
||||
mi | my * |
||||
mine | my * |
||||
mykje | much * |
||||
no | now * |
||||
nokon | some (masc./neut.) * |
||||
noka | some (fem.) * |
||||
nokor | some * |
||||
noko | some * |
||||
nokre | some * |
||||
si | his/hers * |
||||
sia | since * |
||||
sidan | since * |
||||
so | so * |
||||
somt | some * |
||||
somme | some * |
||||
um | about * |
||||
upp | up * |
||||
vere | be * |
||||
vore | was * |
||||
verte | become * |
||||
vort | become * |
||||
varte | became * |
||||
vart | became * |
||||
|
@ -0,0 +1,251 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A Portuguese stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
|
||||
| The following is a ranked list (commonest to rarest) of stopwords |
||||
| deriving from a large sample of text. |
||||
|
||||
| Extra words have been added at the end. |
||||
|
||||
de | of, from |
||||
a | the; to, at; her |
||||
o | the; him |
||||
que | who, that |
||||
e | and |
||||
do | de + o |
||||
da | de + a |
||||
em | in |
||||
um | a |
||||
para | for |
||||
| é from SER |
||||
com | with |
||||
não | not, no |
||||
uma | a |
||||
os | the; them |
||||
no | em + o |
||||
se | himself etc |
||||
na | em + a |
||||
por | for |
||||
mais | more |
||||
as | the; them |
||||
dos | de + os |
||||
como | as, like |
||||
mas | but |
||||
| foi from SER |
||||
ao | a + o |
||||
ele | he |
||||
das | de + as |
||||
| tem from TER |
||||
à | a + a |
||||
seu | his |
||||
sua | her |
||||
ou | or |
||||
| ser from SER |
||||
quando | when |
||||
muito | much |
||||
| há from HAV |
||||
nos | em + os; us |
||||
já | already, now |
||||
| está from EST |
||||
eu | I |
||||
também | also |
||||
só | only, just |
||||
pelo | per + o |
||||
pela | per + a |
||||
até | up to |
||||
isso | that |
||||
ela | she |
||||
entre | between |
||||
| era from SER |
||||
depois | after |
||||
sem | without |
||||
mesmo | same |
||||
aos | a + os |
||||
| ter from TER |
||||
seus | his |
||||
quem | whom |
||||
nas | em + as |
||||
me | me |
||||
esse | that |
||||
eles | they |
||||
| estão from EST |
||||
você | you |
||||
| tinha from TER |
||||
| foram from SER |
||||
essa | that |
||||
num | em + um |
||||
nem | nor |
||||
suas | her |
||||
meu | my |
||||
às | a + as |
||||
minha | my |
||||
| têm from TER |
||||
numa | em + uma |
||||
pelos | per + os |
||||
elas | they |
||||
| havia from HAV |
||||
| seja from SER |
||||
qual | which |
||||
| será from SER |
||||
nós | we |
||||
| tenho from TER |
||||
lhe | to him, her |
||||
deles | of them |
||||
essas | those |
||||
esses | those |
||||
pelas | per + as |
||||
este | this |
||||
| fosse from SER |
||||
dele | of him |
||||
|
||||
| other words. There are many contractions such as naquele = em+aquele, |
||||
| mo = me+o, but they are rare. |
||||
| Indefinite article plural forms are also rare. |
||||
|
||||
tu | thou |
||||
te | thee |
||||
vocês | you (plural) |
||||
vos | you |
||||
lhes | to them |
||||
meus | my |
||||
minhas |
||||
teu | thy |
||||
tua |
||||
teus |
||||
tuas |
||||
nosso | our |
||||
nossa |
||||
nossos |
||||
nossas |
||||
|
||||
dela | of her |
||||
delas | of them |
||||
|
||||
esta | this |
||||
estes | these |
||||
estas | these |
||||
aquele | that |
||||
aquela | that |
||||
aqueles | those |
||||
aquelas | those |
||||
isto | this |
||||
aquilo | that |
||||
|
||||
| forms of estar, to be (not including the infinitive): |
||||
estou |
||||
está |
||||
estamos |
||||
estão |
||||
estive |
||||
esteve |
||||
estivemos |
||||
estiveram |
||||
estava |
||||
estávamos |
||||
estavam |
||||
estivera |
||||
estivéramos |
||||
esteja |
||||
estejamos |
||||
estejam |
||||
estivesse |
||||
estivéssemos |
||||
estivessem |
||||
estiver |
||||
estivermos |
||||
estiverem |
||||
|
||||
| forms of haver, to have (not including the infinitive): |
||||
hei |
||||
há |
||||
havemos |
||||
hão |
||||
houve |
||||
houvemos |
||||
houveram |
||||
houvera |
||||
houvéramos |
||||
haja |
||||
hajamos |
||||
hajam |
||||
houvesse |
||||
houvéssemos |
||||
houvessem |
||||
houver |
||||
houvermos |
||||
houverem |
||||
houverei |
||||
houverá |
||||
houveremos |
||||
houverão |
||||
houveria |
||||
houveríamos |
||||
houveriam |
||||
|
||||
| forms of ser, to be (not including the infinitive): |
||||
sou |
||||
somos |
||||
são |
||||
era |
||||
éramos |
||||
eram |
||||
fui |
||||
foi |
||||
fomos |
||||
foram |
||||
fora |
||||
fôramos |
||||
seja |
||||
sejamos |
||||
sejam |
||||
fosse |
||||
fôssemos |
||||
fossem |
||||
for |
||||
formos |
||||
forem |
||||
serei |
||||
será |
||||
seremos |
||||
serão |
||||
seria |
||||
seríamos |
||||
seriam |
||||
|
||||
| forms of ter, to have (not including the infinitive): |
||||
tenho |
||||
tem |
||||
temos |
||||
têm |
||||
tinha |
||||
tínhamos |
||||
tinham |
||||
tive |
||||
teve |
||||
tivemos |
||||
tiveram |
||||
tivera |
||||
tivéramos |
||||
tenha |
||||
tenhamos |
||||
tenham |
||||
tivesse |
||||
tivéssemos |
||||
tivessem |
||||
tiver |
||||
tivermos |
||||
tiverem |
||||
terei |
||||
terá |
||||
teremos |
||||
terão |
||||
teria |
||||
teríamos |
||||
teriam |
@ -0,0 +1,241 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| a russian stop word list. comments begin with vertical bar. each stop |
||||
| word is at the start of a line. |
||||
|
||||
| this is a ranked list (commonest to rarest) of stopwords derived from |
||||
| a large text sample. |
||||
|
||||
| letter `ё' is translated to `е'. |
||||
|
||||
и | and |
||||
в | in/into |
||||
во | alternative form |
||||
не | not |
||||
что | what/that |
||||
он | he |
||||
на | on/onto |
||||
я | i |
||||
с | from |
||||
со | alternative form |
||||
как | how |
||||
а | milder form of `no' (but) |
||||
то | conjunction and form of `that' |
||||
все | all |
||||
она | she |
||||
так | so, thus |
||||
его | him |
||||
но | but |
||||
да | yes/and |
||||
ты | thou |
||||
к | towards, by |
||||
у | around, chez |
||||
же | intensifier particle |
||||
вы | you |
||||
за | beyond, behind |
||||
бы | conditional/subj. particle |
||||
по | up to, along |
||||
только | only |
||||
ее | her |
||||
мне | to me |
||||
было | it was |
||||
вот | here is/are, particle |
||||
от | away from |
||||
меня | me |
||||
еще | still, yet, more |
||||
нет | no, there isn't/aren't |
||||
о | about |
||||
из | out of |
||||
ему | to him |
||||
теперь | now |
||||
когда | when |
||||
даже | even |
||||
ну | so, well |
||||
вдруг | suddenly |
||||
ли | interrogative particle |
||||
если | if |
||||
уже | already, but homonym of `narrower' |
||||
или | or |
||||
ни | neither |
||||
быть | to be |
||||
был | he was |
||||
него | prepositional form of его |
||||
до | up to |
||||
вас | you accusative |
||||
нибудь | indef. suffix preceded by hyphen |
||||
опять | again |
||||
уж | already, but homonym of `adder' |
||||
вам | to you |
||||
сказал | he said |
||||
ведь | particle `after all' |
||||
там | there |
||||
потом | then |
||||
себя | oneself |
||||
ничего | nothing |
||||
ей | to her |
||||
может | usually with `быть' as `maybe' |
||||
они | they |
||||
тут | here |
||||
где | where |
||||
есть | there is/are |
||||
надо | got to, must |
||||
ней | prepositional form of ей |
||||
для | for |
||||
мы | we |
||||
тебя | thee |
||||
их | them, their |
||||
чем | than |
||||
была | she was |
||||
сам | self |
||||
чтоб | in order to |
||||
без | without |
||||
будто | as if |
||||
человек | man, person, one |
||||
чего | genitive form of `what' |
||||
раз | once |
||||
тоже | also |
||||
себе | to oneself |
||||
под | beneath |
||||
жизнь | life |
||||
будет | will be |
||||
ж | short form of intensifier particle `же' |
||||
тогда | then |
||||
кто | who |
||||
этот | this |
||||
говорил | was saying |
||||
того | genitive form of `that' |
||||
потому | for that reason |
||||
этого | genitive form of `this' |
||||
какой | which |
||||
совсем | altogether |
||||
ним | prepositional form of `его', `они' |
||||
здесь | here |
||||
этом | prepositional form of `этот' |
||||
один | one |
||||
почти | almost |
||||
мой | my |
||||
тем | instrumental/dative plural of `тот', `то' |
||||
чтобы | full form of `in order that' |
||||
нее | her (acc.) |
||||
кажется | it seems |
||||
сейчас | now |
||||
были | they were |
||||
куда | where to |
||||
зачем | why |
||||
сказать | to say |
||||
всех | all (acc., gen. preposn. plural) |
||||
никогда | never |
||||
сегодня | today |
||||
можно | possible, one can |
||||
при | by |
||||
наконец | finally |
||||
два | two |
||||
об | alternative form of `о', about |
||||
другой | another |
||||
хоть | even |
||||
после | after |
||||
над | above |
||||
больше | more |
||||
тот | that one (masc.) |
||||
через | across, in |
||||
эти | these |
||||
нас | us |
||||
про | about |
||||
всего | in all, only, of all |
||||
них | prepositional form of `они' (they) |
||||
какая | which, feminine |
||||
много | lots |
||||
разве | interrogative particle |
||||
сказала | she said |
||||
три | three |
||||
эту | this, acc. fem. sing. |
||||
моя | my, feminine |
||||
впрочем | moreover, besides |
||||
хорошо | good |
||||
свою | one's own, acc. fem. sing. |
||||
этой | oblique form of `эта', fem. `this' |
||||
перед | in front of |
||||
иногда | sometimes |
||||
лучше | better |
||||
чуть | a little |
||||
том | preposn. form of `that one' |
||||
нельзя | one must not |
||||
такой | such a one |
||||
им | to them |
||||
более | more |
||||
всегда | always |
||||
конечно | of course |
||||
всю | acc. fem. sing of `all' |
||||
между | between |
||||
|
||||
|
||||
| b: some paradigms |
||||
| |
||||
| personal pronouns |
||||
| |
||||
| я меня мне мной [мною] |
||||
| ты тебя тебе тобой [тобою] |
||||
| он его ему им [него, нему, ним] |
||||
| она ее ей ею [нее, ней, нею] |
||||
| оно его ему им [него, нему, ним] |
||||
| |
||||
| мы нас нам нами |
||||
| вы вас вам вами |
||||
| они их им ими [них, ним, ними] |
||||
| |
||||
| себя себе собой [собою] |
||||
| |
||||
| demonstrative pronouns: этот (this), тот (that) |
||||
| |
||||
| этот эта это эти |
||||
| этого эту это эти |
||||
| этого этой этого этих |
||||
| этому этой этому этим |
||||
| этим этой этим [этою] этими |
||||
| этом этой этом этих |
||||
| |
||||
| тот та то те |
||||
| того ту то те |
||||
| того той того тех |
||||
| тому той тому тем |
||||
| тем той тем [тою] теми |
||||
| том той том тех |
||||
| |
||||
| determinative pronouns |
||||
| |
||||
| (a) весь (all) |
||||
| |
||||
| весь вся все все |
||||
| всего всю все все |
||||
| всего всей всего всех |
||||
| всему всей всему всем |
||||
| всем всей всем [всею] всеми |
||||
| всем всей всем всех |
||||
| |
||||
| (b) сам (himself etc) |
||||
| |
||||
| сам сама само сами |
||||
| самого саму само самих |
||||
| самого самой самого самих |
||||
| самому самой самому самим |
||||
| самим самой самим [самою] самими |
||||
| самом самой самом самих |
||||
| |
||||
| stems of verbs `to be', `to have', `to do' and modal |
||||
| |
||||
| быть бы буд быв есть суть |
||||
| име |
||||
| дел |
||||
| мог мож мочь |
||||
| уме |
||||
| хоч хот |
||||
| долж |
||||
| можн |
||||
| нужн |
||||
| нельзя |
||||
|
@ -0,0 +1,354 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A Spanish stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
|
||||
| The following is a ranked list (commonest to rarest) of stopwords |
||||
| deriving from a large sample of text. |
||||
|
||||
| Extra words have been added at the end. |
||||
|
||||
de | from, of |
||||
la | the, her |
||||
que | who, that |
||||
el | the |
||||
en | in |
||||
y | and |
||||
a | to |
||||
los | the, them |
||||
del | de + el |
||||
se | himself, from him etc |
||||
las | the, them |
||||
por | for, by, etc |
||||
un | a |
||||
para | for |
||||
con | with |
||||
no | no |
||||
una | a |
||||
su | his, her |
||||
al | a + el |
||||
| es from SER |
||||
lo | him |
||||
como | how |
||||
más | more |
||||
pero | but |
||||
sus | su plural |
||||
le | to him, her |
||||
ya | already |
||||
o | or |
||||
| fue from SER |
||||
este | this |
||||
| ha from HABER |
||||
sí | himself etc |
||||
porque | because |
||||
esta | this |
||||
| son from SER |
||||
entre | between |
||||
| está from ESTAR |
||||
cuando | when |
||||
muy | very |
||||
sin | without |
||||
sobre | on |
||||
| ser from SER |
||||
| tiene from TENER |
||||
también | also |
||||
me | me |
||||
hasta | until |
||||
hay | there is/are |
||||
donde | where |
||||
| han from HABER |
||||
quien | whom, that |
||||
| están from ESTAR |
||||
| estado from ESTAR |
||||
desde | from |
||||
todo | all |
||||
nos | us |
||||
durante | during |
||||
| estados from ESTAR |
||||
todos | all |
||||
uno | a |
||||
les | to them |
||||
ni | nor |
||||
contra | against |
||||
otros | other |
||||
| fueron from SER |
||||
ese | that |
||||
eso | that |
||||
| había from HABER |
||||
ante | before |
||||
ellos | they |
||||
e | and (variant of y) |
||||
esto | this |
||||
mí | me |
||||
antes | before |
||||
algunos | some |
||||
qué | what? |
||||
unos | a |
||||
yo | I |
||||
otro | other |
||||
otras | other |
||||
otra | other |
||||
él | he |
||||
tanto | so much, many |
||||
esa | that |
||||
estos | these |
||||
mucho | much, many |
||||
quienes | who |
||||
nada | nothing |
||||
muchos | many |
||||
cual | who |
||||
| sea from SER |
||||
poco | few |
||||
ella | she |
||||
estar | to be |
||||
| haber from HABER |
||||
estas | these |
||||
| estaba from ESTAR |
||||
| estamos from ESTAR |
||||
algunas | some |
||||
algo | something |
||||
nosotros | we |
||||
|
||||
| other forms |
||||
|
||||
mi | me |
||||
mis | mi plural |
||||
tú | thou |
||||
te | thee |
||||
ti | thee |
||||
tu | thy |
||||
tus | tu plural |
||||
ellas | they |
||||
nosotras | we |
||||
vosotros | you |
||||
vosotras | you |
||||
os | you |
||||
mío | mine |
||||
mía | |
||||
míos | |
||||
mías | |
||||
tuyo | thine |
||||
tuya | |
||||
tuyos | |
||||
tuyas | |
||||
suyo | his, hers, theirs |
||||
suya | |
||||
suyos | |
||||
suyas | |
||||
nuestro | ours |
||||
nuestra | |
||||
nuestros | |
||||
nuestras | |
||||
vuestro | yours |
||||
vuestra | |
||||
vuestros | |
||||
vuestras | |
||||
esos | those |
||||
esas | those |
||||
|
||||
| forms of estar, to be (not including the infinitive): |
||||
estoy |
||||
estás |
||||
está |
||||
estamos |
||||
estáis |
||||
están |
||||
esté |
||||
estés |
||||
estemos |
||||
estéis |
||||
estén |
||||
estaré |
||||
estarás |
||||
estará |
||||
estaremos |
||||
estaréis |
||||
estarán |
||||
estaría |
||||
estarías |
||||
estaríamos |
||||
estaríais |
||||
estarían |
||||
estaba |
||||
estabas |
||||
estábamos |
||||
estabais |
||||
estaban |
||||
estuve |
||||
estuviste |
||||
estuvo |
||||
estuvimos |
||||
estuvisteis |
||||
estuvieron |
||||
estuviera |
||||
estuvieras |
||||
estuviéramos |
||||
estuvierais |
||||
estuvieran |
||||
estuviese |
||||
estuvieses |
||||
estuviésemos |
||||
estuvieseis |
||||
estuviesen |
||||
estando |
||||
estado |
||||
estada |
||||
estados |
||||
estadas |
||||
estad |
||||
|
||||
| forms of haber, to have (not including the infinitive): |
||||
he |
||||
has |
||||
ha |
||||
hemos |
||||
habéis |
||||
han |
||||
haya |
||||
hayas |
||||
hayamos |
||||
hayáis |
||||
hayan |
||||
habré |
||||
habrás |
||||
habrá |
||||
habremos |
||||
habréis |
||||
habrán |
||||
habría |
||||
habrías |
||||
habríamos |
||||
habríais |
||||
habrían |
||||
había |
||||
habías |
||||
habíamos |
||||
habíais |
||||
habían |
||||
hube |
||||
hubiste |
||||
hubo |
||||
hubimos |
||||
hubisteis |
||||
hubieron |
||||
hubiera |
||||
hubieras |
||||
hubiéramos |
||||
hubierais |
||||
hubieran |
||||
hubiese |
||||
hubieses |
||||
hubiésemos |
||||
hubieseis |
||||
hubiesen |
||||
habiendo |
||||
habido |
||||
habida |
||||
habidos |
||||
habidas |
||||
|
||||
| forms of ser, to be (not including the infinitive): |
||||
soy |
||||
eres |
||||
es |
||||
somos |
||||
sois |
||||
son |
||||
sea |
||||
seas |
||||
seamos |
||||
seáis |
||||
sean |
||||
seré |
||||
serás |
||||
será |
||||
seremos |
||||
seréis |
||||
serán |
||||
sería |
||||
serías |
||||
seríamos |
||||
seríais |
||||
serían |
||||
era |
||||
eras |
||||
éramos |
||||
erais |
||||
eran |
||||
fui |
||||
fuiste |
||||
fue |
||||
fuimos |
||||
fuisteis |
||||
fueron |
||||
fuera |
||||
fueras |
||||
fuéramos |
||||
fuerais |
||||
fueran |
||||
fuese |
||||
fueses |
||||
fuésemos |
||||
fueseis |
||||
fuesen |
||||
siendo |
||||
sido |
||||
| sed also means 'thirst' |
||||
|
||||
| forms of tener, to have (not including the infinitive): |
||||
tengo |
||||
tienes |
||||
tiene |
||||
tenemos |
||||
tenéis |
||||
tienen |
||||
tenga |
||||
tengas |
||||
tengamos |
||||
tengáis |
||||
tengan |
||||
tendré |
||||
tendrás |
||||
tendrá |
||||
tendremos |
||||
tendréis |
||||
tendrán |
||||
tendría |
||||
tendrías |
||||
tendríamos |
||||
tendríais |
||||
tendrían |
||||
tenía |
||||
tenías |
||||
teníamos |
||||
teníais |
||||
tenían |
||||
tuve |
||||
tuviste |
||||
tuvo |
||||
tuvimos |
||||
tuvisteis |
||||
tuvieron |
||||
tuviera |
||||
tuvieras |
||||
tuviéramos |
||||
tuvierais |
||||
tuvieran |
||||
tuviese |
||||
tuvieses |
||||
tuviésemos |
||||
tuvieseis |
||||
tuviesen |
||||
teniendo |
||||
tenido |
||||
tenida |
||||
tenidos |
||||
tenidas |
||||
tened |
||||
|
@ -0,0 +1,131 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A Swedish stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from |
||||
| a large text sample. |
||||
|
||||
| Swedish stop words occasionally exhibit homonym clashes. For example |
||||
| så = so, but also seed. These are indicated clearly below. |
||||
|
||||
och | and |
||||
det | it, this/that |
||||
att | to (with infinitive) |
||||
i | in, at |
||||
en | a |
||||
jag | I |
||||
hon | she |
||||
som | who, that |
||||
han | he |
||||
på | on |
||||
den | it, this/that |
||||
med | with |
||||
var | where, each |
||||
sig | him(self) etc |
||||
för | for |
||||
så | so (also: seed) |
||||
till | to |
||||
är | is |
||||
men | but |
||||
ett | a |
||||
om | if; around, about |
||||
hade | had |
||||
de | they, these/those |
||||
av | of |
||||
icke | not, no |
||||
mig | me |
||||
du | you |
||||
henne | her |
||||
då | then, when |
||||
sin | his |
||||
nu | now |
||||
har | have |
||||
inte | inte någon = no one |
||||
hans | his |
||||
honom | him |
||||
skulle | should, would |
||||
hennes | her |
||||
där | there |
||||
min | my |
||||
man | one (pronoun) |
||||
ej | not |
||||
vid | at, by, on (also: vast) |
||||
kunde | could |
||||
något | some etc |
||||
från | from, off |
||||
ut | out |
||||
när | when |
||||
efter | after, behind |
||||
upp | up |
||||
vi | we |
||||
dem | them |
||||
vara | be |
||||
vad | what |
||||
över | over |
||||
än | than |
||||
dig | you |
||||
kan | can |
||||
sina | his |
||||
här | here |
||||
ha | have |
||||
mot | towards |
||||
alla | all |
||||
under | under (also: wonder) |
||||
någon | some etc |
||||
eller | or (else) |
||||
allt | all |
||||
mycket | much |
||||
sedan | since |
||||
ju | after all (particle) |
||||
denna | this/that |
||||
själv | myself, yourself etc |
||||
detta | this/that |
||||
åt | to |
||||
utan | without |
||||
varit | was |
||||
hur | how |
||||
ingen | no |
||||
mitt | my |
||||
ni | you |
||||
bli | to be, become |
||||
blev | from bli |
||||
oss | us |
||||
din | thy |
||||
dessa | these/those |
||||
några | some etc |
||||
deras | their |
||||
blir | from bli |
||||
mina | my |
||||
samma | (the) same |
||||
vilken | who, that |
||||
er | you, your |
||||
sådan | such a |
||||
vår | our |
||||
blivit | from bli |
||||
dess | its |
||||
inom | within |
||||
mellan | between |
||||
sådant | such a |
||||
varför | why |
||||
varje | each |
||||
vilka | who, that |
||||
ditt | thy |
||||
vem | who |
||||
vilket | who, that |
||||
sitta | his |
||||
sådana | such a |
||||
vart | each |
||||
dina | thy |
||||
vars | whose |
||||
vårt | our |
||||
våra | our |
||||
ert | your |
||||
era | your |
||||
vilkas | whose |
||||
|
@ -0,0 +1,212 @@
|
||||
# Turkish stopwords from LUCENE-559 |
||||
# merged with the list from "Information Retrieval on Turkish Texts" |
||||
# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) |
||||
acaba |
||||
altmış |
||||
altı |
||||
ama |
||||
ancak |
||||
arada |
||||
aslında |
||||
ayrıca |
||||
bana |
||||
bazı |
||||
belki |
||||
ben |
||||
benden |
||||
beni |
||||
benim |
||||
beri |
||||
beş |
||||
bile |
||||
bin |
||||
bir |
||||
birçok |
||||
biri |
||||
birkaç |
||||
birkez |
||||
birşey |
||||
birşeyi |
||||
biz |
||||
bize |
||||
bizden |
||||
bizi |
||||
bizim |
||||
böyle |
||||
böylece |
||||
bu |
||||
buna |
||||
bunda |
||||
bundan |
||||
bunlar |
||||
bunları |
||||
bunların |
||||
bunu |
||||
bunun |
||||
burada |
||||
çok |
||||
çünkü |
||||
da |
||||
daha |
||||
dahi |
||||
de |
||||
defa |
||||
değil |
||||
diğer |
||||
diye |
||||
doksan |
||||
dokuz |
||||
dolayı |
||||
dolayısıyla |
||||
dört |
||||
edecek |
||||
eden |
||||
ederek |
||||
edilecek |
||||
ediliyor |
||||
edilmesi |
||||
ediyor |
||||
eğer |
||||
elli |
||||
en |
||||
etmesi |
||||
etti |
||||
ettiği |
||||
ettiğini |
||||
gibi |
||||
göre |
||||
halen |
||||
hangi |
||||
hatta |
||||
hem |
||||
henüz |
||||
hep |
||||
hepsi |
||||
her |
||||
herhangi |
||||
herkesin |
||||
hiç |
||||
hiçbir |
||||
için |
||||
iki |
||||
ile |
||||
ilgili |
||||
ise |
||||
işte |
||||
itibaren |
||||
itibariyle |
||||
kadar |
||||
karşın |
||||
katrilyon |
||||
kendi |
||||
kendilerine |
||||
kendini |
||||
kendisi |
||||
kendisine |
||||
kendisini |
||||
kez |
||||
ki |
||||
kim |
||||
kimden |
||||
kime |
||||
kimi |
||||
kimse |
||||
kırk |
||||
milyar |
||||
milyon |
||||
mu |
||||
mü |
||||
mı |
||||
nasıl |
||||
ne |
||||
neden |
||||
nedenle |
||||
nerde |
||||
nerede |
||||
nereye |
||||
niye |
||||
niçin |
||||
o |
||||
olan |
||||
olarak |
||||
oldu |
||||
olduğu |
||||
olduğunu |
||||
olduklarını |
||||
olmadı |
||||
olmadığı |
||||
olmak |
||||
olması |
||||
olmayan |
||||
olmaz |
||||
olsa |
||||
olsun |
||||
olup |
||||
olur |
||||
olursa |
||||
oluyor |
||||
on |
||||
ona |
||||
ondan |
||||
onlar |
||||
onlardan |
||||
onları |
||||
onların |
||||
onu |
||||
onun |
||||
otuz |
||||
oysa |
||||
öyle |
||||
pek |
||||
rağmen |
||||
sadece |
||||
sanki |
||||
sekiz |
||||
seksen |
||||
sen |
||||
senden |
||||
seni |
||||
senin |
||||
siz |
||||
sizden |
||||
sizi |
||||
sizin |
||||
şey |
||||
şeyden |
||||
şeyi |
||||
şeyler |
||||
şöyle |
||||
şu |
||||
şuna |
||||
şunda |
||||
şundan |
||||
şunları |
||||
şunu |
||||
tarafından |
||||
trilyon |
||||
tüm |
||||
üç |
||||
üzere |
||||
var |
||||
vardı |
||||
ve |
||||
veya |
||||
ya |
||||
yani |
||||
yapacak |
||||
yapılan |
||||
yapılması |
||||
yapıyor |
||||
yapmak |
||||
yaptı |
||||
yaptığı |
||||
yaptığını |
||||
yaptıkları |
||||
yedi |
||||
yerine |
||||
yetmiş |
||||
yine |
||||
yirmi |
||||
yoksa |
||||
yüz |
||||
zaten |
@ -0,0 +1,29 @@
|
||||
package com.fr.third.org.apache.lucene; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
/** Lucene's package information, including version. */ |
||||
public final class LucenePackage { |
||||
|
||||
private LucenePackage() {} // can't construct
|
||||
|
||||
/** Return Lucene's package, including version information. */ |
||||
public static Package get() { |
||||
return LucenePackage.class.getPackage(); |
||||
} |
||||
} |
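
A one-line illustrative use of this class: getImplementationVersion() reads the enclosing jar's manifest, so the result is null when no manifest version is present. The class name here is hypothetical.

    // Minimal sketch: print the repackaged Lucene version, if the manifest carries one.
    public class VersionProbe {
        public static void main(String[] args) {
            Package pkg = com.fr.third.org.apache.lucene.LucenePackage.get();
            System.out.println(pkg.getName() + " version: " + pkg.getImplementationVersion());
        }
    }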
@ -0,0 +1,393 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.store.AlreadyClosedException; |
||||
import com.fr.third.org.apache.lucene.util.CloseableThreadLocal; |
||||
|
||||
import java.io.Closeable; |
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
import java.util.HashMap; |
||||
import java.util.Map; |
||||
|
||||
/** |
||||
* An Analyzer builds TokenStreams, which analyze text. It thus represents a |
||||
* policy for extracting index terms from text. |
||||
* <p> |
||||
* In order to define what analysis is done, subclasses must define their |
||||
* {@link TokenStreamComponents TokenStreamComponents} in {@link #createComponents(String, Reader)}. |
||||
* The components are then reused in each call to {@link #tokenStream(String, Reader)}. |
||||
* <p> |
||||
* Simple example: |
||||
* <pre class="prettyprint"> |
||||
* Analyzer analyzer = new Analyzer() { |
||||
* {@literal @Override} |
||||
* protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
||||
* Tokenizer source = new FooTokenizer(reader); |
||||
* TokenStream filter = new FooFilter(source); |
||||
* filter = new BarFilter(filter); |
||||
* return new TokenStreamComponents(source, filter); |
||||
* } |
||||
* }; |
||||
* </pre> |
||||
* For more examples, see the {@link com.fr.third.org.apache.lucene.analysis Analysis package documentation}. |
||||
* <p> |
||||
* For some concrete implementations bundled with Lucene, look in the analysis modules: |
||||
* <ul> |
||||
* <li><a href="{@docRoot}/../analyzers-common/overview-summary.html">Common</a>: |
||||
* Analyzers for indexing content in different languages and domains. |
||||
* <li><a href="{@docRoot}/../analyzers-icu/overview-summary.html">ICU</a>: |
||||
* Exposes functionality from ICU to Apache Lucene. |
||||
* <li><a href="{@docRoot}/../analyzers-kuromoji/overview-summary.html">Kuromoji</a>: |
||||
* Morphological analyzer for Japanese text. |
||||
* <li><a href="{@docRoot}/../analyzers-morfologik/overview-summary.html">Morfologik</a>: |
||||
* Dictionary-driven lemmatization for the Polish language. |
||||
* <li><a href="{@docRoot}/../analyzers-phonetic/overview-summary.html">Phonetic</a>: |
||||
* Analysis for indexing phonetic signatures (for sounds-alike search). |
||||
* <li><a href="{@docRoot}/../analyzers-smartcn/overview-summary.html">Smart Chinese</a>: |
||||
* Analyzer for Simplified Chinese, which indexes words. |
||||
* <li><a href="{@docRoot}/../analyzers-stempel/overview-summary.html">Stempel</a>: |
||||
* Algorithmic Stemmer for the Polish Language. |
||||
* <li><a href="{@docRoot}/../analyzers-uima/overview-summary.html">UIMA</a>: |
||||
* Analysis integration with Apache UIMA. |
||||
* </ul> |
||||
*/ |
||||
public abstract class Analyzer implements Closeable { |
||||
|
||||
private final ReuseStrategy reuseStrategy; |
||||
|
||||
/** |
||||
* Create a new Analyzer, reusing the same set of components per-thread |
||||
* across calls to {@link #tokenStream(String, Reader)}. |
||||
*/ |
||||
public Analyzer() { |
||||
this(new GlobalReuseStrategy()); |
||||
} |
||||
|
||||
/** |
||||
* Expert: create a new Analyzer with a custom {@link ReuseStrategy}. |
||||
* <p> |
||||
* NOTE: if you just want to reuse on a per-field basis, its easier to |
||||
* use a subclass of {@link AnalyzerWrapper} such as |
||||
* <a href="{@docRoot}/../analyzers-common/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.html"> |
||||
* PerFieldAnalyzerWrapper</a> instead. |
||||
*/ |
||||
public Analyzer(ReuseStrategy reuseStrategy) { |
||||
this.reuseStrategy = reuseStrategy; |
||||
} |
||||
|
||||
/** |
||||
* Creates a new {@link TokenStreamComponents} instance for this analyzer. |
||||
* |
||||
* @param fieldName |
||||
* the name of the field's content passed to the |
||||
* {@link TokenStreamComponents} sink as a reader |
||||
* @param reader |
||||
* the reader passed to the {@link Tokenizer} constructor |
||||
* @return the {@link TokenStreamComponents} for this analyzer. |
||||
*/ |
||||
protected abstract TokenStreamComponents createComponents(String fieldName, |
||||
Reader reader); |
||||
|
||||
/** |
||||
* Returns a TokenStream suitable for <code>fieldName</code>, tokenizing |
||||
* the contents of <code>reader</code>. |
||||
* <p> |
||||
* This method uses {@link #createComponents(String, Reader)} to obtain an |
||||
* instance of {@link TokenStreamComponents}. It returns the sink of the |
||||
* components and stores the components internally. Subsequent calls to this |
||||
* method will reuse the previously stored components after resetting them |
||||
* through {@link TokenStreamComponents#setReader(Reader)}. |
||||
* <p> |
||||
* <b>NOTE:</b> After calling this method, the consumer must follow the |
||||
* workflow described in {@link TokenStream} to properly consume its contents. |
||||
* See the {@link com.fr.third.org.apache.lucene.analysis Analysis package documentation} for |
||||
* some examples demonstrating this. |
||||
* |
||||
* @param fieldName the name of the field the created TokenStream is used for |
||||
* @param reader the reader the stream's source reads from |
||||
* @return TokenStream for iterating the analyzed content of <code>reader</code> |
||||
* @throws AlreadyClosedException if the Analyzer is closed. |
||||
* @throws IOException if an i/o error occurs. |
||||
*/ |
||||
public final TokenStream tokenStream(final String fieldName, |
||||
final Reader reader) throws IOException { |
||||
TokenStreamComponents components = reuseStrategy.getReusableComponents(fieldName); |
||||
final Reader r = initReader(fieldName, reader); |
||||
if (components == null) { |
||||
components = createComponents(fieldName, r); |
||||
reuseStrategy.setReusableComponents(fieldName, components); |
||||
} else { |
||||
components.setReader(r); |
||||
} |
||||
return components.getTokenStream(); |
||||
} |
||||
|
||||
/** |
||||
* Override this if you want to add a CharFilter chain. |
||||
* <p> |
||||
* The default implementation returns <code>reader</code> |
||||
* unchanged. |
||||
* |
||||
* @param fieldName IndexableField name being indexed |
||||
* @param reader original Reader |
||||
* @return reader, optionally decorated with CharFilter(s) |
||||
*/ |
||||
protected Reader initReader(String fieldName, Reader reader) { |
||||
return reader; |
||||
} |
||||
|
||||
/** |
||||
* Invoked before indexing a IndexableField instance if |
||||
* terms have already been added to that field. This allows custom |
||||
* analyzers to place an automatic position increment gap between |
||||
* IndexbleField instances using the same field name. The default value |
||||
* position increment gap is 0. With a 0 position increment gap and |
||||
* the typical default token position increment of 1, all terms in a field, |
||||
* including across IndexableField instances, are in successive positions, allowing |
||||
* exact PhraseQuery matches, for instance, across IndexableField instance boundaries. |
||||
* |
||||
* @param fieldName IndexableField name being indexed. |
||||
* @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}. |
||||
* This value must be {@code >= 0}. |
||||
*/ |
||||
public int getPositionIncrementGap(String fieldName) { |
||||
return 0; |
||||
} |
||||
|
||||
/** |
||||
* Just like {@link #getPositionIncrementGap}, except for |
||||
* Token offsets instead. By default this returns 1. |
||||
* This method is only called if the field |
||||
* produced at least one token for indexing. |
||||
* |
||||
* @param fieldName the field just indexed |
||||
* @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}. |
||||
* This value must be {@code >= 0}. |
||||
*/ |
||||
public int getOffsetGap(String fieldName) { |
||||
return 1; |
||||
} |
||||
|
||||
/** Frees persistent resources used by this Analyzer */ |
||||
public void close() { |
||||
reuseStrategy.close(); |
||||
} |
||||
|
||||
/** |
||||
* This class encapsulates the outer components of a token stream. It provides |
||||
* access to the source ({@link Tokenizer}) and the outer end (sink), an |
||||
* instance of {@link TokenFilter} which also serves as the |
||||
* {@link TokenStream} returned by |
||||
* {@link Analyzer#tokenStream(String, Reader)}. |
||||
*/ |
||||
public static class TokenStreamComponents { |
||||
/** |
||||
* Original source of the tokens. |
||||
*/ |
||||
protected final Tokenizer source; |
||||
/** |
||||
* Sink tokenstream, such as the outer tokenfilter decorating |
||||
* the chain. This can be the source if there are no filters. |
||||
*/ |
||||
protected final TokenStream sink; |
||||
|
||||
/** |
||||
* Creates a new {@link TokenStreamComponents} instance. |
||||
* |
||||
* @param source |
||||
* the analyzer's tokenizer |
||||
* @param result |
||||
* the analyzer's resulting token stream |
||||
*/ |
||||
public TokenStreamComponents(final Tokenizer source, |
||||
final TokenStream result) { |
||||
this.source = source; |
||||
this.sink = result; |
||||
} |
||||
|
||||
/** |
||||
* Creates a new {@link TokenStreamComponents} instance. |
||||
* |
||||
* @param source |
||||
* the analyzer's tokenizer |
||||
*/ |
||||
public TokenStreamComponents(final Tokenizer source) { |
||||
this.source = source; |
||||
this.sink = source; |
||||
} |
||||
|
||||
/** |
||||
* Resets the encapsulated components with the given reader. If the components |
||||
* cannot be reset, an Exception should be thrown. |
||||
* |
||||
* @param reader |
||||
* a reader to reset the source component |
||||
* @throws IOException |
||||
* if the component's reset method throws an {@link IOException} |
||||
*/ |
||||
protected void setReader(final Reader reader) throws IOException { |
||||
source.setReader(reader); |
||||
} |
||||
|
||||
/** |
||||
* Returns the sink {@link TokenStream} |
||||
* |
||||
* @return the sink {@link TokenStream} |
||||
*/ |
||||
public TokenStream getTokenStream() { |
||||
return sink; |
||||
} |
||||
|
||||
/** |
||||
* Returns the component's {@link Tokenizer} |
||||
* |
||||
* @return Component's {@link Tokenizer} |
||||
*/ |
||||
public Tokenizer getTokenizer() { |
||||
return source; |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Strategy defining how TokenStreamComponents are reused per call to |
||||
* {@link Analyzer#tokenStream(String, Reader)}. |
||||
*/ |
||||
public static abstract class ReuseStrategy implements Closeable { |
||||
|
||||
private CloseableThreadLocal<Object> storedValue = new CloseableThreadLocal<Object>(); |
||||
|
||||
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ |
||||
public ReuseStrategy() {} |
||||
|
||||
/** |
||||
* Gets the reusable TokenStreamComponents for the field with the given name |
||||
* |
||||
* @param fieldName Name of the field whose reusable TokenStreamComponents |
||||
* are to be retrieved |
||||
* @return Reusable TokenStreamComponents for the field, or {@code null} |
||||
* if there was no previous components for the field |
||||
*/ |
||||
public abstract TokenStreamComponents getReusableComponents(String fieldName); |
||||
|
||||
/** |
||||
* Stores the given TokenStreamComponents as the reusable components for the |
||||
* field with the given name |
||||
* |
||||
* @param fieldName Name of the field whose TokenStreamComponents are being set |
||||
* @param components TokenStreamComponents which are to be reused for the field |
||||
*/ |
||||
public abstract void setReusableComponents(String fieldName, TokenStreamComponents components); |
||||
|
||||
/** |
||||
* Returns the currently stored value |
||||
* |
||||
* @return Currently stored value or {@code null} if no value is stored |
||||
* @throws AlreadyClosedException if the ReuseStrategy is closed. |
||||
*/ |
||||
protected final Object getStoredValue() { |
||||
try { |
||||
return storedValue.get(); |
||||
} catch (NullPointerException npe) { |
||||
if (storedValue == null) { |
||||
throw new AlreadyClosedException("this Analyzer is closed"); |
||||
} else { |
||||
throw npe; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Sets the stored value |
||||
* |
||||
* @param storedValue Value to store |
||||
* @throws AlreadyClosedException if the ReuseStrategy is closed. |
||||
*/ |
||||
protected final void setStoredValue(Object storedValue) { |
||||
try { |
||||
this.storedValue.set(storedValue); |
||||
} catch (NullPointerException npe) { |
||||
if (storedValue == null) { |
||||
throw new AlreadyClosedException("this Analyzer is closed"); |
||||
} else { |
||||
throw npe; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Closes the ReuseStrategy, freeing any resources |
||||
*/ |
||||
public void close() { |
||||
if (storedValue != null) { |
||||
storedValue.close(); |
||||
storedValue = null; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Implementation of {@link ReuseStrategy} that reuses the same components for |
||||
* every field. |
||||
*/ |
||||
public final static class GlobalReuseStrategy extends ReuseStrategy { |
||||
|
||||
/** Creates a new instance, with empty per-thread values */ |
||||
public GlobalReuseStrategy() {} |
||||
|
||||
@Override |
||||
public TokenStreamComponents getReusableComponents(String fieldName) { |
||||
return (TokenStreamComponents) getStoredValue(); |
||||
} |
||||
|
||||
@Override |
||||
public void setReusableComponents(String fieldName, TokenStreamComponents components) { |
||||
setStoredValue(components); |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Implementation of {@link ReuseStrategy} that reuses components per-field by |
||||
* maintaining a Map of TokenStreamComponents per field name. |
||||
*/ |
||||
public static class PerFieldReuseStrategy extends ReuseStrategy { |
||||
|
||||
/** Creates a new instance, with empty per-thread-per-field values */ |
||||
public PerFieldReuseStrategy() {} |
||||
|
||||
@SuppressWarnings("unchecked") |
||||
@Override |
||||
public TokenStreamComponents getReusableComponents(String fieldName) { |
||||
Map<String, TokenStreamComponents> componentsPerField = (Map<String, TokenStreamComponents>) getStoredValue(); |
||||
return componentsPerField != null ? componentsPerField.get(fieldName) : null; |
||||
} |
||||
|
||||
@SuppressWarnings("unchecked") |
||||
@Override |
||||
public void setReusableComponents(String fieldName, TokenStreamComponents components) { |
||||
Map<String, TokenStreamComponents> componentsPerField = (Map<String, TokenStreamComponents>) getStoredValue(); |
||||
if (componentsPerField == null) { |
||||
componentsPerField = new HashMap<String, TokenStreamComponents>(); |
||||
setStoredValue(componentsPerField); |
||||
} |
||||
componentsPerField.put(fieldName, components); |
||||
} |
||||
} |
||||
|
||||
} |
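
A concrete subclass following the createComponents() contract above could look like the sketch below. WhitespaceTokenizer and LowerCaseFilter come from the analyzers-common module (whose factories this change also vendors); the Version constant is an assumption for this repackaged fork.

    import java.io.Reader;

    import com.fr.third.org.apache.lucene.analysis.Analyzer;
    import com.fr.third.org.apache.lucene.analysis.TokenStream;
    import com.fr.third.org.apache.lucene.analysis.Tokenizer;
    import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter;
    import com.fr.third.org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import com.fr.third.org.apache.lucene.util.Version;

    // Minimal sketch: whitespace tokenization followed by lower-casing.
    public final class LowercaseWhitespaceAnalyzer extends Analyzer {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            // Source tokenizer and the outer filter (sink) form the reusable components.
            Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_40, reader);
            TokenStream sink = new LowerCaseFilter(Version.LUCENE_40, source);
            return new TokenStreamComponents(source, sink);
        }
    }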
@ -0,0 +1,83 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.Reader; |
||||
|
||||
/** |
||||
* Extension to {@link Analyzer} suitable for Analyzers which wrap |
||||
* other Analyzers. |
||||
* <p/> |
||||
* {@link #getWrappedAnalyzer(String)} allows the Analyzer |
||||
* to wrap multiple Analyzers which are selected on a per field basis. |
||||
* <p/> |
||||
* {@link #wrapComponents(String, TokenStreamComponents)} allows the |
||||
* TokenStreamComponents of the wrapped Analyzer to then be wrapped |
||||
* (such as adding a new {@link TokenFilter} to form new TokenStreamComponents). |
||||
*/ |
||||
public abstract class AnalyzerWrapper extends Analyzer { |
||||
|
||||
/** |
||||
* Creates a new AnalyzerWrapper. Since the {@link ReuseStrategy} of |
||||
* the wrapped Analyzers is unknown, {@link PerFieldReuseStrategy} is assumed. |
||||
*/ |
||||
protected AnalyzerWrapper() { |
||||
super(new PerFieldReuseStrategy()); |
||||
} |
||||
|
||||
/** |
||||
* Retrieves the wrapped Analyzer appropriate for analyzing the field with |
||||
* the given name |
||||
* |
||||
* @param fieldName Name of the field which is to be analyzed |
||||
* @return Analyzer for the field with the given name. Assumed to be non-null |
||||
*/ |
||||
protected abstract Analyzer getWrappedAnalyzer(String fieldName); |
||||
|
||||
/** |
||||
* Wraps / alters the given TokenStreamComponents, taken from the wrapped |
||||
* Analyzer, to form new components. It is through this method that new |
||||
* TokenFilters can be added by AnalyzerWrappers. |
||||
* |
||||
* |
||||
* @param fieldName Name of the field which is to be analyzed |
||||
* @param components TokenStreamComponents taken from the wrapped Analyzer |
||||
* @return Wrapped / altered TokenStreamComponents. |
||||
*/ |
||||
protected abstract TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components); |
||||
|
||||
@Override |
||||
protected final TokenStreamComponents createComponents(String fieldName, Reader aReader) { |
||||
return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName, aReader)); |
||||
} |
||||
|
||||
@Override |
||||
public final int getPositionIncrementGap(String fieldName) { |
||||
return getWrappedAnalyzer(fieldName).getPositionIncrementGap(fieldName); |
||||
} |
||||
|
||||
@Override |
||||
public final int getOffsetGap(String fieldName) { |
||||
return getWrappedAnalyzer(fieldName).getOffsetGap(fieldName); |
||||
} |
||||
|
||||
@Override |
||||
public final Reader initReader(String fieldName, Reader reader) { |
||||
return getWrappedAnalyzer(fieldName).initReader(fieldName, reader); |
||||
} |
||||
} |
@ -0,0 +1,98 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.util.Iterator; |
||||
import java.util.LinkedList; |
||||
import java.util.List; |
||||
|
||||
import com.fr.third.org.apache.lucene.util.AttributeSource; |
||||
|
||||
/** |
||||
* This class can be used if the token attributes of a TokenStream |
||||
* are intended to be consumed more than once. It caches |
||||
* all token attribute states locally in a List. |
||||
* |
||||
* <P>CachingTokenFilter implements the optional method |
||||
* {@link TokenStream#reset()}, which repositions the |
||||
* stream to the first Token. |
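* |
* <p>A two-pass usage sketch (the consumer objects are illustrative, |
* not part of this API): |
* <pre class="prettyprint"> |
* TokenStream ts = analyzer.tokenStream("field", reader); |
* ts.reset(); // reset the inner stream before wrapping it |
* CachingTokenFilter cached = new CachingTokenFilter(ts); |
* firstConsumer.consume(cached);  // first pass fills the cache |
* cached.reset();                 // rewind to the first cached token |
* secondConsumer.consume(cached); // second pass replays the cache |
* </pre> |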
||||
*/ |
||||
public final class CachingTokenFilter extends TokenFilter { |
||||
private List<State> cache = null; |
||||
private Iterator<State> iterator = null; |
||||
private State finalState; |
||||
|
||||
/** |
||||
* Create a new CachingTokenFilter around <code>input</code>, |
||||
* caching its token attributes, which can be replayed again |
||||
* after a call to {@link #reset()}. |
||||
*/ |
||||
public CachingTokenFilter(TokenStream input) { |
||||
super(input); |
||||
} |
||||
|
||||
@Override |
||||
public final boolean incrementToken() throws IOException { |
||||
if (cache == null) { |
||||
// fill cache lazily |
||||
cache = new LinkedList<State>(); |
||||
fillCache(); |
||||
iterator = cache.iterator(); |
||||
} |
||||
|
||||
if (!iterator.hasNext()) { |
||||
// the cache is exhausted, return false |
||||
return false; |
||||
} |
||||
// Since the TokenFilter can be reset, the tokens need to be preserved as immutable. |
||||
restoreState(iterator.next()); |
||||
return true; |
||||
} |
||||
|
||||
@Override |
||||
public final void end() { |
||||
if (finalState != null) { |
||||
restoreState(finalState); |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Rewinds the iterator to the beginning of the cached list. |
||||
* <p> |
||||
* Note that this never calls reset() on the wrapped tokenstream, not even |
||||
* the first time. You should reset() the inner tokenstream before wrapping |
||||
* it with CachingTokenFilter. |
||||
*/ |
||||
@Override |
||||
public void reset() { |
||||
if(cache != null) { |
||||
iterator = cache.iterator(); |
||||
} |
||||
} |
||||
|
||||
private void fillCache() throws IOException { |
||||
while(input.incrementToken()) { |
||||
cache.add(captureState()); |
||||
} |
||||
// capture final state |
||||
input.end(); |
||||
finalState = captureState(); |
||||
} |
||||
|
||||
} |
@ -0,0 +1,84 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
|
||||
/** |
||||
* Subclasses of CharFilter can be chained to filter a Reader. |
||||
* They can be used as {@link Reader} with additional offset |
||||
* correction. {@link Tokenizer}s will automatically use {@link #correctOffset} |
||||
* if a CharFilter subclass is used. |
||||
* <p> |
||||
* This class is abstract: at a minimum you must implement {@link #read(char[], int, int)}, |
||||
* transforming the input in some way from {@link #input}, and {@link #correct(int)} |
||||
* to adjust the offsets to match the originals. |
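* <p> |
* A minimal pass-through sketch (illustrative only; it alters nothing, so |
* offsets need no correction): |
* <pre class="prettyprint"> |
* class IdentityCharFilter extends CharFilter { |
*   IdentityCharFilter(Reader in) { super(in); } |
*   protected int correct(int currentOff) { |
*     return currentOff; // no characters inserted or removed |
*   } |
*   public int read(char[] cbuf, int off, int len) throws IOException { |
*     return input.read(cbuf, off, len); |
*   } |
* } |
* </pre> |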
||||
* <p> |
||||
* You can optionally provide more efficient implementations of additional methods |
||||
* like {@link #read()}, {@link #read(char[])}, {@link #read(java.nio.CharBuffer)}, |
||||
* but this is not required. |
||||
* <p> |
||||
* For examples and integration with {@link Analyzer}, see the |
||||
* {@link com.fr.third.org.apache.lucene.analysis Analysis package documentation}. |
||||
*/ |
||||
// the way java.io.FilterReader should work! |
||||
public abstract class CharFilter extends Reader { |
||||
/** |
||||
* The underlying character-input stream. |
||||
*/ |
||||
protected final Reader input; |
||||
|
||||
/** |
||||
* Create a new CharFilter wrapping the provided reader. |
||||
* @param input a Reader, can also be a CharFilter for chaining. |
||||
*/ |
||||
public CharFilter(Reader input) { |
||||
super(input); |
||||
this.input = input; |
||||
} |
||||
|
||||
/** |
||||
* Closes the underlying input stream. |
||||
* <p> |
||||
* <b>NOTE:</b> |
||||
* The default implementation closes the input Reader, so |
||||
* be sure to call <code>super.close()</code> when overriding this method. |
||||
*/ |
||||
@Override |
||||
public void close() throws IOException { |
||||
input.close(); |
||||
} |
||||
|
||||
/** |
||||
* Subclasses override to correct the current offset. |
||||
* |
||||
* @param currentOff current offset |
||||
* @return corrected offset |
||||
*/ |
||||
protected abstract int correct(int currentOff); |
||||
|
||||
/** |
||||
* Chains the corrected offset through the input |
||||
* CharFilter(s). |
||||
*/ |
||||
public final int correctOffset(int currentOff) { |
||||
final int corrected = correct(currentOff); |
||||
return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(corrected) : corrected; |
||||
} |
||||
} |
@ -0,0 +1,321 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
||||
import com.fr.third.org.apache.lucene.document.DoubleField; // for javadocs |
import com.fr.third.org.apache.lucene.document.FloatField; // for javadocs |
import com.fr.third.org.apache.lucene.document.IntField; // for javadocs |
import com.fr.third.org.apache.lucene.document.LongField; // for javadocs |
import com.fr.third.org.apache.lucene.search.NumericRangeFilter; // for javadocs |
||||
import com.fr.third.org.apache.lucene.search.NumericRangeQuery; |
||||
import com.fr.third.org.apache.lucene.util.Attribute; |
||||
import com.fr.third.org.apache.lucene.util.AttributeImpl; |
||||
import com.fr.third.org.apache.lucene.util.AttributeReflector; |
||||
import com.fr.third.org.apache.lucene.util.BytesRef; |
||||
import com.fr.third.org.apache.lucene.util.NumericUtils; |
||||
|
||||
/** |
||||
* <b>Expert:</b> This class provides a {@link TokenStream} |
||||
* for indexing numeric values that can be used by {@link |
||||
* NumericRangeQuery} or {@link NumericRangeFilter}. |
||||
* |
||||
* <p>Note that for simple usage, {@link IntField}, {@link |
||||
* LongField}, {@link FloatField} or {@link DoubleField} is |
||||
* recommended. These fields disable norms and |
||||
* term freqs, as they are not usually needed during |
||||
* searching. If you need to change these settings, you |
||||
* should use this class. |
||||
* |
||||
* <p>Here's an example usage, for an <code>int</code> field: |
||||
* |
||||
* <pre class="prettyprint"> |
||||
* FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED); |
||||
* fieldType.setOmitNorms(true); |
||||
* fieldType.setIndexOptions(IndexOptions.DOCS_ONLY); |
||||
* Field field = new Field(name, new NumericTokenStream(precisionStep).setIntValue(value), fieldType); |
||||
* document.add(field); |
||||
* </pre> |
||||
* |
||||
* <p>For optimal performance, re-use the TokenStream and Field instance |
||||
* for more than one document: |
||||
* |
||||
* <pre class="prettyprint"> |
||||
* NumericTokenStream stream = new NumericTokenStream(precisionStep); |
||||
* FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED); |
||||
* fieldType.setOmitNorms(true); |
||||
* fieldType.setIndexOptions(IndexOptions.DOCS_ONLY); |
||||
* Field field = new Field(name, stream, fieldType); |
||||
* Document document = new Document(); |
||||
* document.add(field); |
||||
* |
||||
* for(all documents) { |
||||
* stream.setIntValue(value); |
||||
* writer.addDocument(document); |
||||
* } |
||||
* </pre> |
||||
* |
||||
* <p>This stream is not intended to be used in analyzers; |
||||
* it's more for iterating the different precisions during |
||||
* indexing a specific numeric value.</p> |
||||
|
||||
* <p><b>NOTE</b>: as token streams are only consumed once |
||||
* the document is added to the index, if you index more |
||||
* than one numeric field, use a separate <code>NumericTokenStream</code> |
||||
* instance for each.</p> |
||||
* |
||||
* <p>See {@link NumericRangeQuery} for more details on the |
||||
* <a |
||||
* href="../search/NumericRangeQuery.html#precisionStepDesc"><code>precisionStep</code></a> |
||||
* parameter as well as how numeric fields work under the hood.</p> |
||||
* |
||||
* @since 2.9 |
||||
*/ |
||||
public final class NumericTokenStream extends TokenStream { |
||||
|
||||
/** The full precision token gets this token type assigned. */ |
||||
public static final String TOKEN_TYPE_FULL_PREC = "fullPrecNumeric"; |
||||
|
||||
/** The lower precision tokens get this token type assigned. */ |
||||
public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecNumeric"; |
||||
|
||||
/** <b>Expert:</b> Use this attribute to get the details of the currently generated token. |
||||
* @lucene.experimental |
||||
* @since 4.0 |
||||
*/ |
||||
public interface NumericTermAttribute extends Attribute { |
||||
/** Returns current shift value, undefined before first token */ |
||||
int getShift(); |
||||
/** Returns current token's raw value as {@code long} with all {@link #getShift} applied, undefined before first token */ |
||||
long getRawValue(); |
||||
/** Returns value size in bits (32 for {@code float}, {@code int}; 64 for {@code double}, {@code long}) */ |
||||
int getValueSize(); |
||||
|
||||
/** <em>Don't call this method!</em> |
||||
* @lucene.internal */ |
||||
void init(long value, int valSize, int precisionStep, int shift); |
||||
|
||||
/** <em>Don't call this method!</em> |
||||
* @lucene.internal */ |
||||
void setShift(int shift); |
||||
|
||||
/** <em>Don't call this method!</em> |
||||
* @lucene.internal */ |
||||
int incShift(); |
||||
} |
||||
|
||||
// just a wrapper to prevent adding a CharTermAttribute (CTA) |
||||
private static final class NumericAttributeFactory extends AttributeFactory { |
||||
private final AttributeFactory delegate; |
||||
|
||||
NumericAttributeFactory(AttributeFactory delegate) { |
||||
this.delegate = delegate; |
||||
} |
||||
|
||||
@Override |
||||
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) { |
||||
if (CharTermAttribute.class.isAssignableFrom(attClass)) |
||||
throw new IllegalArgumentException("NumericTokenStream does not support CharTermAttribute."); |
||||
return delegate.createAttributeInstance(attClass); |
||||
} |
||||
} |
||||
|
||||
/** Implementation of {@link NumericTermAttribute}. |
||||
* @lucene.internal |
||||
* @since 4.0 |
||||
*/ |
||||
public static final class NumericTermAttributeImpl extends AttributeImpl implements NumericTermAttribute,TermToBytesRefAttribute { |
||||
private long value = 0L; |
||||
private int valueSize = 0, shift = 0, precisionStep = 0; |
||||
private BytesRef bytes = new BytesRef(); |
||||
|
||||
/** |
||||
* Creates, but does not yet initialize this attribute instance |
||||
* @see #init(long, int, int, int) |
||||
*/ |
||||
public NumericTermAttributeImpl() {} |
||||
|
||||
public BytesRef getBytesRef() { |
||||
return bytes; |
||||
} |
||||
|
||||
public int fillBytesRef() { |
||||
try { |
||||
assert valueSize == 64 || valueSize == 32; |
||||
return (valueSize == 64) ? |
||||
NumericUtils.longToPrefixCoded(value, shift, bytes) : |
||||
NumericUtils.intToPrefixCoded((int) value, shift, bytes); |
||||
} catch (IllegalArgumentException iae) { |
||||
// return empty token before first or after last |
||||
bytes.length = 0; |
||||
return 0; |
||||
} |
||||
} |
||||
|
||||
public int getShift() { return shift; } |
||||
public void setShift(int shift) { this.shift = shift; } |
||||
public int incShift() { |
||||
return (shift += precisionStep); |
||||
} |
||||
|
||||
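// zero out the lowest 'shift' bits, leaving only the prefix this token encodes |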
public long getRawValue() { return value & ~((1L << shift) - 1L); } |
||||
public int getValueSize() { return valueSize; } |
||||
|
||||
public void init(long value, int valueSize, int precisionStep, int shift) { |
||||
this.value = value; |
||||
this.valueSize = valueSize; |
||||
this.precisionStep = precisionStep; |
||||
this.shift = shift; |
||||
} |
||||
|
||||
@Override |
||||
public void clear() { |
||||
// this attribute has no contents to clear! |
// we keep it untouched as it's fully controlled by outer class. |
||||
} |
||||
|
||||
@Override |
||||
public void reflectWith(AttributeReflector reflector) { |
||||
fillBytesRef(); |
||||
reflector.reflect(TermToBytesRefAttribute.class, "bytes", BytesRef.deepCopyOf(bytes)); |
||||
reflector.reflect(NumericTermAttribute.class, "shift", shift); |
||||
reflector.reflect(NumericTermAttribute.class, "rawValue", getRawValue()); |
||||
reflector.reflect(NumericTermAttribute.class, "valueSize", valueSize); |
||||
} |
||||
|
||||
@Override |
||||
public void copyTo(AttributeImpl target) { |
||||
final NumericTermAttribute a = (NumericTermAttribute) target; |
||||
a.init(value, valueSize, precisionStep, shift); |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Creates a token stream for numeric values using the default <code>precisionStep</code> |
||||
* {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The stream is not yet initialized; |
* before use, set a value using one of the set<em>???</em>Value() methods. |
||||
*/ |
||||
public NumericTokenStream() { |
||||
this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, NumericUtils.PRECISION_STEP_DEFAULT); |
||||
} |
||||
|
||||
/** |
||||
* Creates a token stream for numeric values with the specified |
||||
* <code>precisionStep</code>. The stream is not yet initialized; |
* before use, set a value using one of the set<em>???</em>Value() methods. |
||||
*/ |
||||
public NumericTokenStream(final int precisionStep) { |
||||
this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, precisionStep); |
||||
} |
||||
|
||||
/** |
||||
* Expert: Creates a token stream for numeric values with the specified |
||||
* <code>precisionStep</code> using the given |
||||
* {@link AttributeFactory}. |
||||
* The stream is not yet initialized; |
* before use, set a value using one of the set<em>???</em>Value() methods. |
||||
*/ |
||||
public NumericTokenStream(AttributeFactory factory, final int precisionStep) { |
||||
super(new NumericAttributeFactory(factory)); |
||||
if (precisionStep < 1) |
||||
throw new IllegalArgumentException("precisionStep must be >=1"); |
||||
this.precisionStep = precisionStep; |
||||
numericAtt.setShift(-precisionStep); |
||||
} |
||||
|
||||
/** |
||||
* Initializes the token stream with the supplied <code>long</code> value. |
||||
* @param value the value, for which this TokenStream should enumerate tokens. |
||||
* @return this instance, because of this you can use it the following way: |
||||
* <code>new Field(name, new NumericTokenStream(precisionStep).setLongValue(value))</code> |
||||
*/ |
||||
public NumericTokenStream setLongValue(final long value) { |
||||
numericAtt.init(value, valSize = 64, precisionStep, -precisionStep); |
||||
return this; |
||||
} |
||||
|
||||
/** |
||||
* Initializes the token stream with the supplied <code>int</code> value. |
||||
* @param value the value, for which this TokenStream should enumerate tokens. |
||||
* @return this instance, because of this you can use it the following way: |
||||
* <code>new Field(name, new NumericTokenStream(precisionStep).setIntValue(value))</code> |
||||
*/ |
||||
public NumericTokenStream setIntValue(final int value) { |
||||
numericAtt.init(value, valSize = 32, precisionStep, -precisionStep); |
||||
return this; |
||||
} |
||||
|
||||
/** |
||||
* Initializes the token stream with the supplied <code>double</code> value. |
||||
* @param value the value, for which this TokenStream should enumerate tokens. |
||||
* @return this instance, because of this you can use it the following way: |
||||
* <code>new Field(name, new NumericTokenStream(precisionStep).setDoubleValue(value))</code> |
||||
*/ |
||||
public NumericTokenStream setDoubleValue(final double value) { |
||||
numericAtt.init(NumericUtils.doubleToSortableLong(value), valSize = 64, precisionStep, -precisionStep); |
||||
return this; |
||||
} |
||||
|
||||
/** |
||||
* Initializes the token stream with the supplied <code>float</code> value. |
||||
* @param value the value, for which this TokenStream should enumerate tokens. |
||||
* @return this instance, because of this you can use it the following way: |
||||
* <code>new Field(name, new NumericTokenStream(precisionStep).setFloatValue(value))</code> |
||||
*/ |
||||
public NumericTokenStream setFloatValue(final float value) { |
||||
numericAtt.init(NumericUtils.floatToSortableInt(value), valSize = 32, precisionStep, -precisionStep); |
||||
return this; |
||||
} |
||||
|
||||
@Override |
||||
public void reset() { |
||||
if (valSize == 0) |
||||
throw new IllegalStateException("call set???Value() before usage"); |
||||
numericAtt.setShift(-precisionStep); |
||||
} |
||||
|
||||
@Override |
||||
public boolean incrementToken() { |
||||
if (valSize == 0) |
||||
throw new IllegalStateException("call set???Value() before usage"); |
||||
|
||||
// this will only clear all other attributes in this TokenStream |
||||
clearAttributes(); |
||||
|
||||
final int shift = numericAtt.incShift(); |
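// shift == 0 marks the full-precision token; every later token is a |
// lower-precision prefix stacked at the same position (posIncr 0) |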
||||
typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC); |
||||
posIncrAtt.setPositionIncrement((shift == 0) ? 1 : 0); |
||||
return (shift < valSize); |
||||
} |
||||
|
||||
/** Returns the precision step. */ |
||||
public int getPrecisionStep() { |
||||
return precisionStep; |
||||
} |
||||
|
||||
// members |
||||
private final NumericTermAttribute numericAtt = addAttribute(NumericTermAttribute.class); |
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); |
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); |
||||
|
||||
private int valSize = 0; // valSize==0 means not initialized |
||||
private final int precisionStep; |
||||
} |
@ -0,0 +1,651 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.FlagsAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
||||
import com.fr.third.org.apache.lucene.index.DocsAndPositionsEnum; // for javadoc |
||||
import com.fr.third.org.apache.lucene.util.Attribute; |
||||
import com.fr.third.org.apache.lucene.util.AttributeSource; |
||||
import com.fr.third.org.apache.lucene.util.AttributeImpl; |
||||
import com.fr.third.org.apache.lucene.util.AttributeReflector; |
||||
import com.fr.third.org.apache.lucene.util.BytesRef; |
||||
|
||||
/** |
||||
A Token is an occurrence of a term from the text of a field. It consists of |
||||
a term's text, the start and end offset of the term in the text of the field, |
||||
and a type string. |
||||
<p> |
||||
The start and end offsets permit applications to re-associate a token with |
||||
its source text, e.g., to display highlighted query terms in a document |
||||
browser, or to show matching text fragments in a <abbr title="KeyWord In Context">KWIC</abbr> |
||||
display, etc. |
||||
<p> |
||||
The type is a string, assigned by a lexical analyzer |
||||
(a.k.a. tokenizer), naming the lexical or syntactic class that the token |
||||
belongs to. For example an end of sentence marker token might be implemented |
||||
with type "eos". The default token type is "word". |
||||
<p> |
||||
A Token can optionally have metadata (a.k.a. payload) in the form of a variable |
||||
length byte array. Use {@link DocsAndPositionsEnum#getPayload()} to retrieve the |
||||
payloads from the index. |
||||
|
||||
<br><br> |
||||
|
||||
<p><b>NOTE:</b> As of 2.9, Token implements all {@link Attribute} interfaces |
||||
that are part of core Lucene and can be found in the {@code tokenattributes} subpackage. |
||||
Even though it is not necessary to use Token anymore, with the new TokenStream API it can |
||||
be used as a convenience class that implements all {@link Attribute}s, which is especially useful |
||||
to easily switch from the old to the new TokenStream API. |
||||
|
||||
<br><br> |
||||
|
||||
<p>Tokenizers and TokenFilters should try to re-use a Token |
||||
instance when possible for best performance, by |
||||
implementing the {@link TokenStream#incrementToken()} API. |
||||
Failing that, to create a new Token you should first use |
||||
one of the constructors that start with null text. To load |
||||
the token from a char[] use {@link #copyBuffer(char[], int, int)}. |
||||
To load from a String use {@link #setEmpty} followed by {@link #append(CharSequence)} or {@link #append(CharSequence, int, int)}. |
||||
Alternatively you can get the Token's termBuffer by calling either {@link #buffer()}, |
||||
if you know that your text is shorter than the capacity of the termBuffer |
||||
or {@link #resizeBuffer(int)}, if there is any possibility |
||||
that you may need to grow the buffer. Fill in the characters of your term into this |
||||
buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string, |
||||
or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setLength(int)} to |
||||
set the length of the term text. See <a target="_top" |
||||
href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a> |
||||
for details.</p> |
||||
<p>Typical Token reuse patterns: |
||||
<ul> |
||||
<li> Copying text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/> |
||||
<pre class="prettyprint"> |
||||
return reusableToken.reinit(string, startOffset, endOffset[, type]); |
||||
</pre> |
||||
</li> |
||||
<li> Copying some text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/> |
||||
<pre class="prettyprint"> |
||||
return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]); |
||||
</pre> |
||||
</li> |
||||
<li> Copying text from char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/> |
||||
<pre class="prettyprint"> |
||||
return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]); |
||||
</pre> |
||||
</li> |
||||
<li> Copying some text from a char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/> |
||||
<pre class="prettyprint"> |
||||
return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]); |
||||
</pre> |
||||
</li> |
||||
<li> Copying from one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/> |
||||
<pre class="prettyprint"> |
||||
return reusableToken.reinit(source.buffer(), 0, source.length(), source.startOffset(), source.endOffset()[, source.type()]); |
||||
</pre> |
||||
</li> |
||||
</ul> |
||||
A few things to note: |
||||
<ul> |
||||
<li>clear() initializes all of the fields to default values. This is a change from Lucene 2.4, but should affect no one.</li> |
||||
<li>Because <code>TokenStreams</code> can be chained, one cannot assume that the <code>Token's</code> current type is correct.</li> |
||||
<li>The startOffset and endOffset represent the start and end offset in the source text, so be careful in adjusting them.</li> |
||||
<li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li> |
||||
</ul> |
||||
</p> |
||||
<p> |
||||
<b>Please note:</b> With Lucene 3.1, the <code>{@linkplain #toString toString()}</code> method had to be changed to match the |
||||
{@link CharSequence} interface introduced by the interface {@link com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute}. |
||||
This method now prints only the term text, with no additional information. |
||||
</p> |
||||
*/ |
||||
public class Token extends CharTermAttributeImpl |
||||
implements TypeAttribute, PositionIncrementAttribute, |
||||
FlagsAttribute, OffsetAttribute, PayloadAttribute, PositionLengthAttribute { |
||||
|
||||
private int startOffset,endOffset; |
||||
private String type = DEFAULT_TYPE; |
||||
private int flags; |
||||
private BytesRef payload; |
||||
private int positionIncrement = 1; |
||||
private int positionLength = 1; |
||||
|
||||
/** Constructs a Token with null text. */ |
||||
public Token() { |
||||
} |
||||
|
||||
/** Constructs a Token with null text and start & end |
||||
* offsets. |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text */ |
||||
public Token(int start, int end) { |
||||
checkOffsets(start, end); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
} |
||||
|
||||
/** Constructs a Token with null text and start & end |
||||
* offsets plus the Token type. |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text |
||||
* @param typ the lexical type of this Token */ |
||||
public Token(int start, int end, String typ) { |
||||
checkOffsets(start, end); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
type = typ; |
||||
} |
||||
|
||||
/** |
||||
* Constructs a Token with null text and start & end |
||||
* offsets plus flags. NOTE: flags is EXPERIMENTAL. |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text |
||||
* @param flags The bits to set for this token |
||||
*/ |
||||
public Token(int start, int end, int flags) { |
||||
checkOffsets(start, end); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
this.flags = flags; |
||||
} |
||||
|
||||
/** Constructs a Token with the given term text, and start |
||||
* & end offsets. The type defaults to "word." |
||||
* <b>NOTE:</b> for better indexing speed you should |
||||
* instead use the char[] termBuffer methods to set the |
||||
* term text. |
||||
* @param text term text |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text |
||||
*/ |
||||
public Token(String text, int start, int end) { |
||||
checkOffsets(start, end); |
||||
append(text); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
} |
||||
|
||||
/** Constructs a Token with the given text, start and end |
||||
* offsets, & type. <b>NOTE:</b> for better indexing |
||||
* speed you should instead use the char[] termBuffer |
||||
* methods to set the term text. |
||||
* @param text term text |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text |
||||
* @param typ token type |
||||
*/ |
||||
public Token(String text, int start, int end, String typ) { |
||||
checkOffsets(start, end); |
||||
append(text); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
type = typ; |
||||
} |
||||
|
||||
/** |
||||
* Constructs a Token with the given text, start and end |
||||
* offsets, & type. <b>NOTE:</b> for better indexing |
||||
* speed you should instead use the char[] termBuffer |
||||
* methods to set the term text. |
||||
* @param text term text |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text |
||||
* @param flags token type bits |
||||
*/ |
||||
public Token(String text, int start, int end, int flags) { |
||||
checkOffsets(start, end); |
||||
append(text); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
this.flags = flags; |
||||
} |
||||
|
||||
/** |
||||
* Constructs a Token with the given term buffer (offset |
||||
* & length), start and end |
||||
* offsets |
||||
* @param startTermBuffer buffer containing term text |
||||
* @param termBufferOffset the index in the buffer of the first character |
||||
* @param termBufferLength number of valid characters in the buffer |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text |
||||
*/ |
||||
public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) { |
||||
checkOffsets(start, end); |
||||
copyBuffer(startTermBuffer, termBufferOffset, termBufferLength); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see PositionIncrementAttribute |
||||
*/ |
||||
public void setPositionIncrement(int positionIncrement) { |
||||
if (positionIncrement < 0) |
||||
throw new IllegalArgumentException |
||||
("Increment must be zero or greater: " + positionIncrement); |
||||
this.positionIncrement = positionIncrement; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see PositionIncrementAttribute |
||||
*/ |
||||
public int getPositionIncrement() { |
||||
return positionIncrement; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see PositionLengthAttribute |
||||
*/ |
||||
@Override |
||||
public void setPositionLength(int positionLength) { |
||||
this.positionLength = positionLength; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see PositionLengthAttribute |
||||
*/ |
||||
@Override |
||||
public int getPositionLength() { |
||||
return positionLength; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see OffsetAttribute |
||||
*/ |
||||
public final int startOffset() { |
||||
return startOffset; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see OffsetAttribute |
||||
*/ |
||||
public final int endOffset() { |
||||
return endOffset; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see OffsetAttribute |
||||
*/ |
||||
public void setOffset(int startOffset, int endOffset) { |
||||
checkOffsets(startOffset, endOffset); |
||||
this.startOffset = startOffset; |
||||
this.endOffset = endOffset; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see TypeAttribute |
||||
*/ |
||||
public final String type() { |
||||
return type; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see TypeAttribute |
||||
*/ |
||||
public final void setType(String type) { |
||||
this.type = type; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see FlagsAttribute |
||||
*/ |
||||
public int getFlags() { |
||||
return flags; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see FlagsAttribute |
||||
*/ |
||||
public void setFlags(int flags) { |
||||
this.flags = flags; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see PayloadAttribute |
||||
*/ |
||||
public BytesRef getPayload() { |
||||
return this.payload; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see PayloadAttribute |
||||
*/ |
||||
public void setPayload(BytesRef payload) { |
||||
this.payload = payload; |
||||
} |
||||
|
||||
/** Resets the term text, payload, flags, and positionIncrement, |
||||
* startOffset, endOffset and token type to default. |
||||
*/ |
||||
@Override |
||||
public void clear() { |
||||
super.clear(); |
||||
payload = null; |
||||
positionIncrement = 1; |
||||
flags = 0; |
||||
startOffset = endOffset = 0; |
||||
type = DEFAULT_TYPE; |
||||
} |
||||
|
||||
@Override |
||||
public Token clone() { |
||||
Token t = (Token)super.clone(); |
||||
// Do a deep clone |
||||
if (payload != null) { |
||||
t.payload = payload.clone(); |
||||
} |
||||
return t; |
||||
} |
||||
|
||||
/** Makes a clone, but replaces the term buffer & |
||||
* start/end offset in the process. This is more |
||||
* efficient than doing a full clone (and then calling |
||||
* {@link #copyBuffer}) because it saves a wasted copy of the old |
||||
* termBuffer. */ |
||||
public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { |
||||
final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset); |
||||
t.positionIncrement = positionIncrement; |
||||
t.flags = flags; |
||||
t.type = type; |
||||
if (payload != null) |
||||
t.payload = payload.clone(); |
||||
return t; |
||||
} |
||||
|
||||
@Override |
||||
public boolean equals(Object obj) { |
||||
if (obj == this) |
||||
return true; |
||||
|
||||
if (obj instanceof Token) { |
||||
final Token other = (Token) obj; |
||||
return (startOffset == other.startOffset && |
||||
endOffset == other.endOffset && |
||||
flags == other.flags && |
||||
positionIncrement == other.positionIncrement && |
||||
(type == null ? other.type == null : type.equals(other.type)) && |
||||
(payload == null ? other.payload == null : payload.equals(other.payload)) && |
||||
super.equals(obj) |
||||
); |
||||
} else |
||||
return false; |
||||
} |
||||
|
||||
@Override |
||||
public int hashCode() { |
||||
int code = super.hashCode(); |
||||
code = code * 31 + startOffset; |
||||
code = code * 31 + endOffset; |
||||
code = code * 31 + flags; |
||||
code = code * 31 + positionIncrement; |
||||
if (type != null) |
||||
code = code * 31 + type.hashCode(); |
||||
if (payload != null) |
||||
code = code * 31 + payload.hashCode(); |
||||
return code; |
||||
} |
||||
|
||||
// like clear() but doesn't clear termBuffer/text |
||||
private void clearNoTermBuffer() { |
||||
payload = null; |
||||
positionIncrement = 1; |
||||
flags = 0; |
||||
startOffset = endOffset = 0; |
||||
type = DEFAULT_TYPE; |
||||
} |
||||
|
||||
/** Shorthand for calling {@link #clear}, |
||||
* {@link #copyBuffer(char[], int, int)}, |
||||
* {@link #setOffset}, |
||||
* {@link #setType} |
||||
* @return this Token instance */ |
||||
public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { |
||||
checkOffsets(newStartOffset, newEndOffset); |
||||
clearNoTermBuffer(); |
||||
copyBuffer(newTermBuffer, newTermOffset, newTermLength); |
||||
payload = null; |
||||
positionIncrement = 1; |
||||
startOffset = newStartOffset; |
||||
endOffset = newEndOffset; |
||||
type = newType; |
||||
return this; |
||||
} |
||||
|
||||
/** Shorthand for calling {@link #clear}, |
||||
* {@link #copyBuffer(char[], int, int)}, |
||||
* {@link #setOffset}, |
||||
* {@link #setType} on Token.DEFAULT_TYPE |
||||
* @return this Token instance */ |
||||
public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { |
||||
checkOffsets(newStartOffset, newEndOffset); |
||||
clearNoTermBuffer(); |
||||
copyBuffer(newTermBuffer, newTermOffset, newTermLength); |
||||
startOffset = newStartOffset; |
||||
endOffset = newEndOffset; |
||||
type = DEFAULT_TYPE; |
||||
return this; |
||||
} |
||||
|
||||
/** Shorthand for calling {@link #clear}, |
||||
* {@link #append(CharSequence)}, |
||||
* {@link #setOffset}, |
||||
* {@link #setType} |
||||
* @return this Token instance */ |
||||
public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) { |
||||
checkOffsets(newStartOffset, newEndOffset); |
||||
clear(); |
||||
append(newTerm); |
||||
startOffset = newStartOffset; |
||||
endOffset = newEndOffset; |
||||
type = newType; |
||||
return this; |
||||
} |
||||
|
||||
/** Shorthand for calling {@link #clear}, |
||||
* {@link #append(CharSequence, int, int)}, |
||||
* {@link #setOffset}, |
||||
* {@link #setType} |
||||
* @return this Token instance */ |
||||
public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { |
||||
checkOffsets(newStartOffset, newEndOffset); |
||||
clear(); |
||||
append(newTerm, newTermOffset, newTermOffset + newTermLength); |
||||
startOffset = newStartOffset; |
||||
endOffset = newEndOffset; |
||||
type = newType; |
||||
return this; |
||||
} |
||||
|
||||
/** Shorthand for calling {@link #clear}, |
||||
* {@link #append(CharSequence)}, |
||||
* {@link #setOffset}, |
||||
* {@link #setType} on Token.DEFAULT_TYPE |
||||
* @return this Token instance */ |
||||
public Token reinit(String newTerm, int newStartOffset, int newEndOffset) { |
||||
checkOffsets(newStartOffset, newEndOffset); |
||||
clear(); |
||||
append(newTerm); |
||||
startOffset = newStartOffset; |
||||
endOffset = newEndOffset; |
||||
type = DEFAULT_TYPE; |
||||
return this; |
||||
} |
||||
|
||||
/** Shorthand for calling {@link #clear}, |
||||
* {@link #append(CharSequence, int, int)}, |
||||
* {@link #setOffset}, |
||||
* {@link #setType} on Token.DEFAULT_TYPE |
||||
* @return this Token instance */ |
||||
public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { |
||||
checkOffsets(newStartOffset, newEndOffset); |
||||
clear(); |
||||
append(newTerm, newTermOffset, newTermOffset + newTermLength); |
||||
startOffset = newStartOffset; |
||||
endOffset = newEndOffset; |
||||
type = DEFAULT_TYPE; |
||||
return this; |
||||
} |
||||
|
||||
/** |
||||
* Copy the prototype token's fields into this one. Note: Payloads are shared. |
||||
* @param prototype source Token to copy fields from |
||||
*/ |
||||
public void reinit(Token prototype) { |
||||
copyBuffer(prototype.buffer(), 0, prototype.length()); |
||||
positionIncrement = prototype.positionIncrement; |
||||
flags = prototype.flags; |
||||
startOffset = prototype.startOffset; |
||||
endOffset = prototype.endOffset; |
||||
type = prototype.type; |
||||
payload = prototype.payload; |
||||
} |
||||
|
||||
/** |
||||
* Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared. |
||||
* @param prototype existing Token |
||||
* @param newTerm new term text |
||||
*/ |
||||
public void reinit(Token prototype, String newTerm) { |
||||
setEmpty().append(newTerm); |
||||
positionIncrement = prototype.positionIncrement; |
||||
flags = prototype.flags; |
||||
startOffset = prototype.startOffset; |
||||
endOffset = prototype.endOffset; |
||||
type = prototype.type; |
||||
payload = prototype.payload; |
||||
} |
||||
|
||||
/** |
||||
* Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared. |
||||
* @param prototype existing Token |
||||
* @param newTermBuffer buffer containing new term text |
||||
* @param offset the index in the buffer of the first character |
||||
* @param length number of valid characters in the buffer |
||||
*/ |
||||
public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) { |
||||
copyBuffer(newTermBuffer, offset, length); |
||||
positionIncrement = prototype.positionIncrement; |
||||
flags = prototype.flags; |
||||
startOffset = prototype.startOffset; |
||||
endOffset = prototype.endOffset; |
||||
type = prototype.type; |
||||
payload = prototype.payload; |
||||
} |
||||
|
||||
@Override |
||||
public void copyTo(AttributeImpl target) { |
||||
if (target instanceof Token) { |
||||
final Token to = (Token) target; |
||||
to.reinit(this); |
||||
// reinit shares the payload, so clone it: |
||||
if (payload !=null) { |
||||
to.payload = payload.clone(); |
||||
} |
||||
} else { |
||||
super.copyTo(target); |
||||
((OffsetAttribute) target).setOffset(startOffset, endOffset); |
||||
((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement); |
||||
((PayloadAttribute) target).setPayload((payload == null) ? null : payload.clone()); |
||||
((FlagsAttribute) target).setFlags(flags); |
||||
((TypeAttribute) target).setType(type); |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public void reflectWith(AttributeReflector reflector) { |
||||
super.reflectWith(reflector); |
||||
reflector.reflect(OffsetAttribute.class, "startOffset", startOffset); |
||||
reflector.reflect(OffsetAttribute.class, "endOffset", endOffset); |
||||
reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement); |
||||
reflector.reflect(PayloadAttribute.class, "payload", payload); |
||||
reflector.reflect(FlagsAttribute.class, "flags", flags); |
||||
reflector.reflect(TypeAttribute.class, "type", type); |
||||
} |
||||
|
||||
private void checkOffsets(int startOffset, int endOffset) { |
||||
if (startOffset < 0 || endOffset < startOffset) { |
||||
throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " |
||||
+ "startOffset=" + startOffset + ",endOffset=" + endOffset); |
||||
} |
||||
} |
||||
|
||||
/** Convenience factory that returns <code>Token</code> as implementation for the basic |
||||
* attributes and returns the default impl (with "Impl" appended) for all other |
||||
* attributes. |
||||
* @since 3.0 |
||||
*/ |
||||
public static final AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY = |
||||
new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); |
||||
|
||||
/** <b>Expert:</b> Creates a TokenAttributeFactory returning {@link Token} as instance for the basic attributes |
||||
* and for all other attributes calls the given delegate factory. |
||||
* @since 3.0 |
||||
*/ |
||||
public static final class TokenAttributeFactory extends AttributeSource.AttributeFactory { |
||||
|
||||
private final AttributeSource.AttributeFactory delegate; |
||||
|
||||
/** <b>Expert</b>: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes |
||||
* and for all other attributes calls the given delegate factory. */ |
||||
public TokenAttributeFactory(AttributeSource.AttributeFactory delegate) { |
||||
this.delegate = delegate; |
||||
} |
||||
|
||||
@Override |
||||
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) { |
||||
return attClass.isAssignableFrom(Token.class) |
||||
? new Token() : delegate.createAttributeInstance(attClass); |
||||
} |
||||
|
||||
@Override |
||||
public boolean equals(Object other) { |
||||
if (this == other) return true; |
||||
if (other instanceof TokenAttributeFactory) { |
||||
final TokenAttributeFactory af = (TokenAttributeFactory) other; |
||||
return this.delegate.equals(af.delegate); |
||||
} |
||||
return false; |
||||
} |
||||
|
||||
@Override |
||||
public int hashCode() { |
||||
return delegate.hashCode() ^ 0x0a45aa31; |
||||
} |
||||
} |
||||
|
||||
} |
@ -0,0 +1,72 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
|
||||
/** A TokenFilter is a TokenStream whose input is another TokenStream. |
||||
<p> |
||||
This is an abstract class; subclasses must override {@link #incrementToken()}. |
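<p> |
A minimal sketch of a filter that upper-cases each term it passes through |
(illustrative only; the attribute class comes from the tokenattributes package): |
<pre class="prettyprint"> |
final class UpperCaseFilter extends TokenFilter { |
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
  UpperCaseFilter(TokenStream in) { super(in); } |
  public boolean incrementToken() throws IOException { |
    if (!input.incrementToken()) return false; |
    final char[] buffer = termAtt.buffer(); |
    for (int i = termAtt.length() - 1; i >= 0; i--) { |
      buffer[i] = Character.toUpperCase(buffer[i]); |
    } |
    return true; |
  } |
} |
</pre> |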
||||
@see TokenStream |
||||
*/ |
||||
public abstract class TokenFilter extends TokenStream { |
||||
/** The source of tokens for this filter. */ |
||||
protected final TokenStream input; |
||||
|
||||
/** Construct a token stream filtering the given input. */ |
||||
protected TokenFilter(TokenStream input) { |
||||
super(input); |
||||
this.input = input; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* <p> |
||||
* <b>NOTE:</b> |
||||
* The default implementation chains the call to the input TokenStream, so |
||||
* be sure to call <code>super.end()</code> first when overriding this method. |
||||
*/ |
||||
@Override |
||||
public void end() throws IOException { |
||||
input.end(); |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* <p> |
||||
* <b>NOTE:</b> |
||||
* The default implementation chains the call to the input TokenStream, so |
||||
* be sure to call <code>super.close()</code> when overriding this method. |
||||
*/ |
||||
@Override |
||||
public void close() throws IOException { |
||||
input.close(); |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* <p> |
||||
* <b>NOTE:</b> |
||||
* The default implementation chains the call to the input TokenStream, so |
||||
* be sure to call <code>super.reset()</code> when overriding this method. |
||||
*/ |
||||
@Override |
||||
public void reset() throws IOException { |
||||
input.reset(); |
||||
} |
||||
} |
@ -0,0 +1,181 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Closeable; |
||||
import java.lang.reflect.Modifier; |
||||
|
||||
import com.fr.third.org.apache.lucene.document.Document; |
||||
import com.fr.third.org.apache.lucene.document.Field; |
||||
import com.fr.third.org.apache.lucene.index.IndexWriter; |
||||
import com.fr.third.org.apache.lucene.util.Attribute; |
||||
import com.fr.third.org.apache.lucene.util.AttributeImpl; |
||||
import com.fr.third.org.apache.lucene.util.AttributeSource; |
||||
|
||||
/** |
||||
* A <code>TokenStream</code> enumerates the sequence of tokens, either from |
||||
* {@link Field}s of a {@link Document} or from query text. |
||||
* <p> |
||||
* This is an abstract class; concrete subclasses are: |
||||
* <ul> |
||||
* <li>{@link Tokenizer}, a <code>TokenStream</code> whose input is a Reader; and |
||||
* <li>{@link TokenFilter}, a <code>TokenStream</code> whose input is another |
||||
* <code>TokenStream</code>. |
||||
* </ul> |
||||
* A new <code>TokenStream</code> API has been introduced with Lucene 2.9. This API |
||||
* has moved from being {@link Token}-based to {@link Attribute}-based. While |
||||
* {@link Token} still exists in 2.9 as a convenience class, the preferred way |
||||
* to store the information of a {@link Token} is to use {@link AttributeImpl}s. |
||||
* <p> |
||||
* <code>TokenStream</code> now extends {@link AttributeSource}, which provides |
||||
* access to all of the token {@link Attribute}s for the <code>TokenStream</code>. |
||||
* Note that only one instance per {@link AttributeImpl} is created and reused |
||||
* for every token. This approach reduces object creation and allows local |
||||
* caching of references to the {@link AttributeImpl}s. See |
||||
* {@link #incrementToken()} for further details. |
||||
* <p> |
||||
* <b>The workflow of the new <code>TokenStream</code> API is as follows</b> (a code sketch follows the list): |
||||
* <ol> |
||||
* <li>Instantiation of <code>TokenStream</code>/{@link TokenFilter}s which add/get |
||||
* attributes to/from the {@link AttributeSource}. |
||||
* <li>The consumer calls {@link TokenStream#reset()}. |
||||
* <li>The consumer retrieves attributes from the stream and stores local |
||||
* references to all attributes it wants to access. |
||||
* <li>The consumer calls {@link #incrementToken()} until it returns false, |
||||
* consuming the attributes after each call. |
||||
* <li>The consumer calls {@link #end()} so that any end-of-stream operations |
||||
* can be performed. |
||||
* <li>The consumer calls {@link #close()} to release any resource when finished |
||||
* using the <code>TokenStream</code>. |
||||
* </ol> |
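* <p>A consumer loop sketch following this workflow (the analyzer, field |
* name and reader are illustrative): |
* <pre class="prettyprint"> |
* TokenStream stream = analyzer.tokenStream("body", reader); |
* CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); |
* stream.reset(); |
* while (stream.incrementToken()) { |
*   System.out.println(termAtt.toString()); |
* } |
* stream.end(); |
* stream.close(); |
* </pre> |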
||||
* To make sure that filters and consumers know which attributes are available, |
||||
* the attributes must be added during instantiation. Filters and consumers are |
||||
* not required to check for availability of attributes in |
||||
* {@link #incrementToken()}. |
||||
* <p> |
||||
* You can find some example code for the new API in the analysis package level |
||||
* Javadoc. |
||||
* <p> |
||||
* Sometimes it is desirable to capture a current state of a <code>TokenStream</code>, |
||||
* e.g., for buffering purposes (see {@link CachingTokenFilter}, |
||||
* TeeSinkTokenFilter). For this use case |
||||
* {@link AttributeSource#captureState} and {@link AttributeSource#restoreState} |
||||
* can be used. |
||||
* <p>The {@code TokenStream}-API in Lucene is based on the decorator pattern. |
||||
* Therefore all non-abstract subclasses must be final or have at least a final |
||||
* implementation of {@link #incrementToken}! This is checked when Java |
||||
* assertions are enabled. |
||||
*/ |
||||
public abstract class TokenStream extends AttributeSource implements Closeable { |
||||
|
||||
/** |
||||
* A TokenStream using the default attribute factory. |
||||
*/ |
||||
protected TokenStream() { |
||||
super(); |
||||
assert assertFinal(); |
||||
} |
||||
|
||||
/** |
||||
* A TokenStream that uses the same attributes as the supplied one. |
||||
*/ |
||||
protected TokenStream(AttributeSource input) { |
||||
super(input); |
||||
assert assertFinal(); |
||||
} |
||||
|
||||
/** |
||||
* A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances. |
||||
*/ |
||||
protected TokenStream(AttributeFactory factory) { |
||||
super(factory); |
||||
assert assertFinal(); |
||||
} |
||||
|
||||
private boolean assertFinal() { |
||||
try { |
||||
final Class<?> clazz = getClass(); |
||||
if (!clazz.desiredAssertionStatus()) |
||||
return true; |
||||
assert clazz.isAnonymousClass() || |
||||
(clazz.getModifiers() & (Modifier.FINAL | Modifier.PRIVATE)) != 0 || |
||||
Modifier.isFinal(clazz.getMethod("incrementToken").getModifiers()) : |
||||
"TokenStream implementation classes or at least their incrementToken() implementation must be final"; |
||||
return true; |
||||
} catch (NoSuchMethodException nsme) { |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Consumers (i.e., {@link IndexWriter}) use this method to advance the stream to |
||||
* the next token. Implementing classes must implement this method and update |
||||
* the appropriate {@link AttributeImpl}s with the attributes of the next |
||||
* token. |
||||
* <P> |
||||
* The producer must make no assumptions about the attributes after the method |
||||
* has been returned: the caller may arbitrarily change it. If the producer |
||||
* needs to preserve the state for subsequent calls, it can use |
||||
* {@link #captureState} to create a copy of the current attribute state. |
||||
* <p> |
||||
* This method is called for every token of a document, so an efficient |
||||
* implementation is crucial for good performance. To avoid calls to |
||||
* {@link #addAttribute(Class)} and {@link #getAttribute(Class)}, |
||||
* references to all {@link AttributeImpl}s that this stream uses should be |
||||
* retrieved during instantiation. |
||||
* <p> |
||||
* To ensure that filters and consumers know which attributes are available, |
||||
* the attributes must be added during instantiation. Filters and consumers |
||||
* are not required to check for availability of attributes in |
||||
* {@link #incrementToken()}. |
||||
* |
||||
* @return false for end of stream; true otherwise |
||||
*/ |
||||
public abstract boolean incrementToken() throws IOException; |
||||
|
||||
/** |
||||
* This method is called by the consumer after the last token has been |
||||
* consumed, after {@link #incrementToken()} returned <code>false</code> |
||||
* (using the new <code>TokenStream</code> API). Streams implementing the old API |
||||
* should upgrade to use this feature. |
||||
* <p/> |
||||
* This method can be used to perform any end-of-stream operations, such as |
||||
* setting the final offset of a stream. The final offset of a stream might |
||||
* differ from the offset of the last token, e.g. in case one or more whitespace |
* characters followed the last token and a WhitespaceTokenizer was used. |
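* <p>An override sketch (<code>offsetAtt</code> and <code>finalOffset</code> |
* are illustrative fields assumed to exist in the subclass): |
* <pre class="prettyprint"> |
* public void end() throws IOException { |
*   super.end(); |
*   offsetAtt.setOffset(finalOffset, finalOffset); |
* } |
* </pre> |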
||||
* |
||||
* @throws IOException If an I/O error occurs |
||||
*/ |
||||
public void end() throws IOException { |
||||
// do nothing by default |
||||
} |
||||
|
||||
/** |
||||
* This method is called by a consumer before it begins consumption using |
||||
* {@link #incrementToken()}. |
||||
* <p/> |
||||
* Resets this stream to a clean state. Stateful implementations must implement |
||||
* this method so that they can be reused, just as if they had been created fresh. |
||||
*/ |
||||
public void reset() throws IOException {} |
||||
|
||||
/** Releases resources associated with this stream. */ |
||||
public void close() throws IOException {} |
||||
|
||||
} |
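The consumer contract documented above (reset, then incrementToken until it returns false, then end and close) is easiest to see as a loop. A minimal sketch, assuming an existing Analyzer instance named analyzer and the usual tokenattributes imports; the field name and sample text are illustrative, not part of this file:

TokenStream stream = analyzer.tokenStream("field", new StringReader("some text"));
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); // retrieve once, up front
stream.reset();                    // bring the stream to a clean state before consuming
while (stream.incrementToken()) {  // false signals end of stream
  System.out.println(termAtt.toString());
}
stream.end();                      // end-of-stream operations, e.g. the final offset
stream.close();                    // release underlying resources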
@ -0,0 +1,99 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.util.AttributeSource; |
||||
|
||||
import java.io.Reader; |
||||
import java.io.IOException; |
||||
|
||||
/** A Tokenizer is a TokenStream whose input is a Reader. |
||||
<p> |
||||
This is an abstract class; subclasses must override {@link #incrementToken()} |
||||
<p> |
||||
NOTE: Subclasses overriding {@link #incrementToken()} must |
||||
call {@link AttributeSource#clearAttributes()} before |
||||
setting attributes. |
||||
*/ |
||||
public abstract class Tokenizer extends TokenStream { |
||||
/** The text source for this Tokenizer. */ |
||||
protected Reader input; |
||||
|
||||
/** Construct a token stream processing the given input. */ |
||||
protected Tokenizer(Reader input) { |
||||
assert input != null: "input must not be null"; |
||||
this.input = input; |
||||
} |
||||
|
||||
/** Construct a token stream processing the given input using the given AttributeFactory. */ |
||||
protected Tokenizer(AttributeFactory factory, Reader input) { |
||||
super(factory); |
||||
assert input != null: "input must not be null"; |
||||
this.input = input; |
||||
} |
||||
|
||||
/** Construct a token stream processing the given input using the given AttributeSource. */ |
||||
protected Tokenizer(AttributeSource source, Reader input) { |
||||
super(source); |
||||
assert input != null: "input must not be null"; |
||||
this.input = input; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* <p> |
||||
* <b>NOTE:</b> |
||||
* The default implementation closes the input Reader, so |
||||
* be sure to call <code>super.close()</code> when overriding this method. |
||||
*/ |
||||
@Override |
||||
public void close() throws IOException { |
||||
if (input != null) { |
||||
input.close(); |
||||
// LUCENE-2387: don't hold onto Reader after close, so
|
||||
// GC can reclaim
|
||||
input = null; |
||||
} |
||||
} |
||||
|
||||
/** Return the corrected offset. If {@link #input} is a {@link CharFilter} subclass |
||||
* this method calls {@link CharFilter#correctOffset}, else returns <code>currentOff</code>. |
||||
* @param currentOff offset as seen in the output |
||||
* @return corrected offset based on the input |
||||
* @see CharFilter#correctOffset |
||||
*/ |
||||
protected final int correctOffset(int currentOff) { |
||||
assert input != null: "this tokenizer is closed"; |
||||
return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(currentOff) : currentOff; |
||||
} |
||||
|
||||
/** Expert: Set a new reader on the Tokenizer. Typically, an |
||||
* analyzer (in its tokenStream method) will use |
||||
* this to re-use a previously created tokenizer. */ |
||||
public final void setReader(Reader input) throws IOException { |
||||
assert input != null: "input must not be null"; |
||||
this.input = input; |
||||
assert setReaderTestPoint(); |
||||
} |
||||
|
||||
// only used by assert, for testing
|
||||
boolean setReaderTestPoint() { |
||||
return true; |
||||
} |
||||
} |
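A minimal sketch of a Tokenizer subclass, illustrating the two contracts noted above: clearAttributes() is called before any attribute is set, and offsets are passed through correctOffset(). It emits (up to) the first 256 characters of the input as a single token; the class and field names are invented for illustration:

final class WholeInputTokenizer extends Tokenizer {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private boolean done = false;

  WholeInputTokenizer(Reader in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (done) {
      return false;
    }
    done = true;
    clearAttributes();                 // required before setting attributes
    char[] buffer = new char[256];
    int length = input.read(buffer);
    if (length <= 0) {
      return false;                    // empty input: no token
    }
    termAtt.copyBuffer(buffer, 0, length);
    offsetAtt.setOffset(correctOffset(0), correctOffset(length));
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    done = false;                      // make the tokenizer reusable
  }
}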
||||
|
@ -0,0 +1,153 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.Analyzer; |
||||
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.core.StopFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet; |
||||
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.Tokenizer; |
||||
import com.fr.third.org.apache.lucene.util.Version; |
||||
|
||||
/** |
||||
* {@link Analyzer} for Arabic. |
||||
* <p> |
||||
* This analyzer implements light-stemming as specified by: |
||||
* <i> |
||||
* Light Stemming for Arabic Information Retrieval |
||||
* </i> |
||||
* http://www.mtholyoke.edu/~lballest/Pubs/arab_stem05.pdf
|
||||
* <p> |
||||
* The analysis package contains three primary components: |
||||
* <ul> |
||||
* <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization. |
||||
* <li>{@link ArabicStemFilter}: Arabic light stemming |
||||
* <li>Arabic stop words file: a set of default Arabic stop words. |
||||
* </ul> |
||||
* |
||||
*/ |
||||
public final class ArabicAnalyzer extends StopwordAnalyzerBase { |
||||
|
||||
/** |
||||
* File containing default Arabic stopwords. |
||||
* |
||||
* Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
|
||||
* The stopword list is BSD-Licensed. |
||||
*/ |
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; |
||||
|
||||
/** |
||||
* Returns an unmodifiable instance of the default stop-words set. |
||||
* @return an unmodifiable instance of the default stop-words set. |
||||
*/ |
||||
public static CharArraySet getDefaultStopSet(){ |
||||
return DefaultSetHolder.DEFAULT_STOP_SET; |
||||
} |
||||
|
||||
/** |
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||
 * accesses the static final set the first time. |
||||
*/ |
||||
private static class DefaultSetHolder { |
||||
static final CharArraySet DEFAULT_STOP_SET; |
||||
|
||||
static { |
||||
try { |
||||
DEFAULT_STOP_SET = loadStopwordSet(false, ArabicAnalyzer.class, DEFAULT_STOPWORD_FILE, "#"); |
||||
} catch (IOException ex) { |
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set"); |
||||
} |
||||
} |
||||
} |
||||
|
||||
private final CharArraySet stemExclusionSet; |
||||
|
||||
/** |
||||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. |
||||
*/ |
||||
public ArabicAnalyzer(Version matchVersion) { |
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer with the given stop words |
||||
* |
||||
* @param matchVersion |
||||
* lucene compatibility version |
||||
* @param stopwords |
||||
* a stopword set |
||||
*/ |
||||
public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords){ |
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
||||
} |
||||
|
||||
/** |
||||
 * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
||||
* provided this analyzer will add a {@link KeywordMarkerFilter} before |
||||
* {@link ArabicStemFilter}. |
||||
* |
||||
* @param matchVersion |
||||
* lucene compatibility version |
||||
* @param stopwords |
||||
* a stopword set |
||||
* @param stemExclusionSet |
||||
* a set of terms not to be stemmed |
||||
*/ |
||||
public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){ |
||||
super(matchVersion, stopwords); |
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
||||
matchVersion, stemExclusionSet)); |
||||
} |
||||
|
||||
/** |
||||
* Creates |
||||
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* used to tokenize all the text in the provided {@link Reader}. |
||||
* |
||||
* @return {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* built from an {@link StandardTokenizer} filtered with |
||||
* {@link LowerCaseFilter}, {@link StopFilter}, |
||||
* {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter} |
||||
* if a stem exclusion set is provided and {@link ArabicStemFilter}. |
||||
*/ |
||||
@Override |
||||
protected TokenStreamComponents createComponents(String fieldName, |
||||
Reader reader) { |
||||
final Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31) ? |
||||
new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader); |
||||
TokenStream result = new LowerCaseFilter(matchVersion, source); |
||||
// the order here is important: the stopword list is not normalized!
|
||||
result = new StopFilter(matchVersion, result, stopwords); |
||||
// TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
|
||||
result = new ArabicNormalizationFilter(result); |
||||
if(!stemExclusionSet.isEmpty()) { |
||||
result = new KeywordMarkerFilter(result, stemExclusionSet); |
||||
} |
||||
return new TokenStreamComponents(source, new ArabicStemFilter(result)); |
||||
} |
||||
} |
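A usage sketch for the analyzer above; Version.LUCENE_CURRENT stands in for whatever compatibility constant the application pins, and the exclusion term is illustrative:

// Terms in the exclusion set are keyword-marked and bypass ArabicStemFilter.
CharArraySet exclusions = new CharArraySet(Version.LUCENE_CURRENT,
    Arrays.asList("كتاب"), false);
Analyzer analyzer = new ArabicAnalyzer(Version.LUCENE_CURRENT,
    ArabicAnalyzer.getDefaultStopSet(), exclusions);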
||||
|
@ -0,0 +1,96 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.Reader; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.core.LetterTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link
|
||||
import com.fr.third.org.apache.lucene.util.AttributeSource; |
||||
import com.fr.third.org.apache.lucene.util.Version; |
||||
|
||||
/** |
||||
* Tokenizer that breaks text into runs of letters and diacritics. |
||||
* <p> |
||||
* The problem with the standard Letter tokenizer is that it fails on diacritics. |
||||
* Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc. |
||||
* </p> |
||||
* <p> |
||||
* <a name="version"/> |
||||
* You must specify the required {@link Version} compatibility when creating |
||||
* {@link ArabicLetterTokenizer}: |
||||
* <ul> |
||||
* <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and |
||||
* detect token characters. See {@link #isTokenChar(int)} and |
||||
* {@link #normalize(int)} for details.</li> |
||||
* </ul> |
||||
* @deprecated (3.1) Use {@link StandardTokenizer} instead. |
||||
*/ |
||||
@Deprecated |
||||
public class ArabicLetterTokenizer extends LetterTokenizer { |
||||
/** |
||||
* Construct a new ArabicLetterTokenizer. |
||||
* @param matchVersion Lucene version |
||||
 * to match; see <a href="#version">above</a> |
||||
* |
||||
* @param in |
||||
* the input to split up into tokens |
||||
*/ |
||||
public ArabicLetterTokenizer(Version matchVersion, Reader in) { |
||||
super(matchVersion, in); |
||||
} |
||||
|
||||
/** |
||||
* Construct a new ArabicLetterTokenizer using a given {@link AttributeSource}. |
||||
* |
||||
* @param matchVersion |
||||
 * Lucene version to match; see <a href="#version">above</a> |
||||
* @param source |
||||
* the attribute source to use for this Tokenizer |
||||
* @param in |
||||
* the input to split up into tokens |
||||
*/ |
||||
public ArabicLetterTokenizer(Version matchVersion, AttributeSource source, Reader in) { |
||||
super(matchVersion, source, in); |
||||
} |
||||
|
||||
/** |
||||
* Construct a new ArabicLetterTokenizer using a given |
||||
 * {@link AttributeSource.AttributeFactory}. |
||||
 * |
||||
 * @param matchVersion |
||||
 * Lucene version to match; see <a href="#version">above</a> |
||||
* |
||||
* @param factory |
||||
* the attribute factory to use for this Tokenizer |
||||
* @param in |
||||
* the input to split up into tokens |
||||
*/ |
||||
public ArabicLetterTokenizer(Version matchVersion, AttributeFactory factory, Reader in) { |
||||
super(matchVersion, factory, in); |
||||
} |
||||
|
||||
/** |
||||
* Allows for Letter category or NonspacingMark category |
||||
* @see com.fr.third.org.apache.lucene.analysis.core.LetterTokenizer#isTokenChar(int) |
||||
*/ |
||||
@Override |
||||
protected boolean isTokenChar(int c) { |
||||
return super.isTokenChar(c) || Character.getType(c) == Character.NON_SPACING_MARK; |
||||
} |
||||
|
||||
} |
@ -0,0 +1,43 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.ar.ArabicLetterTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenizerFactory; |
||||
|
||||
import java.io.Reader; |
||||
import java.util.Map; |
||||
|
||||
|
||||
/** |
||||
 * Factory for {@link ArabicLetterTokenizer}. |
||||
* @deprecated (3.1) Use StandardTokenizerFactory instead. |
||||
**/ |
||||
@Deprecated |
||||
public class ArabicLetterTokenizerFactory extends TokenizerFactory { |
||||
|
||||
@Override |
||||
public void init(Map<String,String> args) { |
||||
super.init(args); |
||||
assureMatchVersion(); |
||||
} |
||||
|
||||
public ArabicLetterTokenizer create(Reader input) { |
||||
return new ArabicLetterTokenizer(luceneMatchVersion, input); |
||||
} |
||||
} |
@ -0,0 +1,48 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
|
||||
/** |
||||
* A {@link TokenFilter} that applies {@link ArabicNormalizer} to normalize the orthography. |
||||
* |
||||
*/ |
||||
|
||||
public final class ArabicNormalizationFilter extends TokenFilter { |
||||
private final ArabicNormalizer normalizer = new ArabicNormalizer(); |
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
|
||||
public ArabicNormalizationFilter(TokenStream input) { |
||||
super(input); |
||||
} |
||||
|
||||
@Override |
||||
public boolean incrementToken() throws IOException { |
||||
if (input.incrementToken()) { |
||||
int newlen = normalizer.normalize(termAtt.buffer(), termAtt.length()); |
||||
termAtt.setLength(newlen); |
||||
return true; |
||||
} |
||||
return false; |
||||
} |
||||
} |
@ -0,0 +1,48 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.ar.ArabicNormalizationFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.util.AbstractAnalysisFactory; |
||||
import com.fr.third.org.apache.lucene.analysis.util.MultiTermAwareComponent; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory; |
||||
|
||||
|
||||
/** |
||||
* Factory for {@link ArabicNormalizationFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_arnormal" class="solr.TextField" positionIncrementGap="100"> |
||||
* <analyzer> |
||||
* <tokenizer class="solr.StandardTokenizerFactory"/> |
||||
* <filter class="solr.ArabicNormalizationFilterFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
* |
||||
*/ |
||||
public class ArabicNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { |
||||
|
||||
public ArabicNormalizationFilter create(TokenStream input) { |
||||
return new ArabicNormalizationFilter(input); |
||||
} |
||||
|
||||
@Override |
||||
public AbstractAnalysisFactory getMultiTermComponent() { |
||||
return this; |
||||
} |
||||
} |
@ -0,0 +1,101 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import static com.fr.third.org.apache.lucene.analysis.util.StemmerUtil.*; |
||||
|
||||
/** |
||||
* Normalizer for Arabic. |
||||
* <p> |
||||
* Normalization is done in-place for efficiency, operating on a termbuffer. |
||||
* <p> |
||||
* Normalization is defined as: |
||||
* <ul> |
||||
* <li> Normalization of hamza with alef seat to a bare alef. |
||||
 * <li> Normalization of teh marbuta to heh. |
||||
* <li> Normalization of dotless yeh (alef maksura) to yeh. |
||||
 * <li> Removal of Arabic diacritics (the harakat). |
||||
* <li> Removal of tatweel (stretching character). |
||||
* </ul> |
||||
* |
||||
*/ |
||||
public class ArabicNormalizer { |
||||
public static final char ALEF = '\u0627'; |
||||
public static final char ALEF_MADDA = '\u0622'; |
||||
public static final char ALEF_HAMZA_ABOVE = '\u0623'; |
||||
public static final char ALEF_HAMZA_BELOW = '\u0625'; |
||||
|
||||
public static final char YEH = '\u064A'; |
||||
public static final char DOTLESS_YEH = '\u0649'; |
||||
|
||||
public static final char TEH_MARBUTA = '\u0629'; |
||||
public static final char HEH = '\u0647'; |
||||
|
||||
public static final char TATWEEL = '\u0640'; |
||||
|
||||
public static final char FATHATAN = '\u064B'; |
||||
public static final char DAMMATAN = '\u064C'; |
||||
public static final char KASRATAN = '\u064D'; |
||||
public static final char FATHA = '\u064E'; |
||||
public static final char DAMMA = '\u064F'; |
||||
public static final char KASRA = '\u0650'; |
||||
public static final char SHADDA = '\u0651'; |
||||
public static final char SUKUN = '\u0652'; |
||||
|
||||
/** |
||||
* Normalize an input buffer of Arabic text |
||||
* |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
* @return length of input buffer after normalization |
||||
*/ |
||||
public int normalize(char s[], int len) { |
||||
|
||||
for (int i = 0; i < len; i++) { |
||||
switch (s[i]) { |
||||
case ALEF_MADDA: |
||||
case ALEF_HAMZA_ABOVE: |
||||
case ALEF_HAMZA_BELOW: |
||||
s[i] = ALEF; |
||||
break; |
||||
case DOTLESS_YEH: |
||||
s[i] = YEH; |
||||
break; |
||||
case TEH_MARBUTA: |
||||
s[i] = HEH; |
||||
break; |
||||
case TATWEEL: |
||||
case KASRATAN: |
||||
case DAMMATAN: |
||||
case FATHATAN: |
||||
case FATHA: |
||||
case DAMMA: |
||||
case KASRA: |
||||
case SHADDA: |
||||
case SUKUN: |
||||
len = delete(s, i, len); |
||||
i--; |
||||
break; |
||||
default: |
||||
break; |
||||
} |
||||
} |
||||
|
||||
return len; |
||||
} |
||||
} |
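A short sketch of the in-place contract described above: the buffer is rewritten and the new logical length returned, so callers must honor the returned length rather than the array length. The sample word is illustrative:

ArabicNormalizer normalizer = new ArabicNormalizer();
char[] buffer = "أَحْمَد".toCharArray();               // seated hamza plus harakat
int newLength = normalizer.normalize(buffer, buffer.length);
String normalized = new String(buffer, 0, newLength);  // "احمد"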
@ -0,0 +1,58 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // javadoc @link
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
|
||||
/** |
||||
 * A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words. |
||||
* <p> |
||||
* To prevent terms from being stemmed use an instance of |
||||
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets |
||||
* the {@link KeywordAttribute} before this {@link TokenStream}. |
||||
* </p> |
||||
 * @see KeywordMarkerFilter |
||||
 */ |
||||
|
||||
public final class ArabicStemFilter extends TokenFilter { |
||||
private final ArabicStemmer stemmer = new ArabicStemmer(); |
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); |
||||
|
||||
public ArabicStemFilter(TokenStream input) { |
||||
super(input); |
||||
} |
||||
|
||||
@Override |
||||
public boolean incrementToken() throws IOException { |
||||
if (input.incrementToken()) { |
||||
if(!keywordAttr.isKeyword()) { |
||||
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); |
||||
termAtt.setLength(newlen); |
||||
} |
||||
return true; |
||||
} else { |
||||
return false; |
||||
} |
||||
} |
||||
} |
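The KeywordAttribute interplay described above, shown as a hand-built chain; reader and protectedTerms (a CharArraySet) are assumed to exist in the caller's scope, and Version.LUCENE_CURRENT is illustrative:

TokenStream ts = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
ts = new LowerCaseFilter(Version.LUCENE_CURRENT, ts);
ts = new ArabicNormalizationFilter(ts);
ts = new KeywordMarkerFilter(ts, protectedTerms); // marks matching terms as keywords
ts = new ArabicStemFilter(ts);                    // keyword-marked terms pass through unstemmed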
@ -0,0 +1,43 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.ar.ArabicStemFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory; |
||||
|
||||
|
||||
/** |
||||
* Factory for {@link ArabicStemFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_arstem" class="solr.TextField" positionIncrementGap="100"> |
||||
* <analyzer> |
||||
* <tokenizer class="solr.StandardTokenizerFactory"/> |
||||
* <filter class="solr.ArabicNormalizationFilterFactory"/> |
||||
* <filter class="solr.ArabicStemFilterFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
* |
||||
*/ |
||||
public class ArabicStemFilterFactory extends TokenFilterFactory { |
||||
|
||||
|
||||
public ArabicStemFilter create(TokenStream input) { |
||||
return new ArabicStemFilter(input); |
||||
} |
||||
} |
@ -0,0 +1,150 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import static com.fr.third.org.apache.lucene.analysis.util.StemmerUtil.*; |
||||
|
||||
/** |
||||
* Stemmer for Arabic. |
||||
* <p> |
||||
* Stemming is done in-place for efficiency, operating on a termbuffer. |
||||
* <p> |
||||
* Stemming is defined as: |
||||
* <ul> |
||||
* <li> Removal of attached definite article, conjunction, and prepositions. |
||||
* <li> Stemming of common suffixes. |
||||
* </ul> |
||||
* |
||||
*/ |
||||
public class ArabicStemmer { |
||||
public static final char ALEF = '\u0627'; |
||||
public static final char BEH = '\u0628'; |
||||
public static final char TEH_MARBUTA = '\u0629'; |
||||
public static final char TEH = '\u062A'; |
||||
public static final char FEH = '\u0641'; |
||||
public static final char KAF = '\u0643'; |
||||
public static final char LAM = '\u0644'; |
||||
public static final char NOON = '\u0646'; |
||||
public static final char HEH = '\u0647'; |
||||
public static final char WAW = '\u0648'; |
||||
public static final char YEH = '\u064A'; |
||||
|
||||
public static final char prefixes[][] = { |
||||
("" + ALEF + LAM).toCharArray(), |
||||
("" + WAW + ALEF + LAM).toCharArray(), |
||||
("" + BEH + ALEF + LAM).toCharArray(), |
||||
("" + KAF + ALEF + LAM).toCharArray(), |
||||
("" + FEH + ALEF + LAM).toCharArray(), |
||||
("" + LAM + LAM).toCharArray(), |
||||
("" + WAW).toCharArray(), |
||||
}; |
||||
|
||||
public static final char suffixes[][] = { |
||||
("" + HEH + ALEF).toCharArray(), |
||||
("" + ALEF + NOON).toCharArray(), |
||||
("" + ALEF + TEH).toCharArray(), |
||||
("" + WAW + NOON).toCharArray(), |
||||
("" + YEH + NOON).toCharArray(), |
||||
("" + YEH + HEH).toCharArray(), |
||||
("" + YEH + TEH_MARBUTA).toCharArray(), |
||||
("" + HEH).toCharArray(), |
||||
("" + TEH_MARBUTA).toCharArray(), |
||||
("" + YEH).toCharArray(), |
||||
}; |
||||
|
||||
/** |
||||
* Stem an input buffer of Arabic text. |
||||
* |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
 * @return length of input buffer after stemming |
||||
*/ |
||||
public int stem(char s[], int len) { |
||||
len = stemPrefix(s, len); |
||||
len = stemSuffix(s, len); |
||||
|
||||
return len; |
||||
} |
||||
|
||||
/** |
||||
* Stem a prefix off an Arabic word. |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
* @return new length of input buffer after stemming. |
||||
*/ |
||||
public int stemPrefix(char s[], int len) { |
||||
for (int i = 0; i < prefixes.length; i++) |
||||
if (startsWithCheckLength(s, len, prefixes[i])) |
||||
return deleteN(s, 0, len, prefixes[i].length); |
||||
return len; |
||||
} |
||||
|
||||
/** |
||||
* Stem suffix(es) off an Arabic word. |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
* @return new length of input buffer after stemming |
||||
*/ |
||||
public int stemSuffix(char s[], int len) { |
||||
for (int i = 0; i < suffixes.length; i++) |
||||
if (endsWithCheckLength(s, len, suffixes[i])) |
||||
len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length); |
||||
return len; |
||||
} |
||||
|
||||
/** |
||||
* Returns true if the prefix matches and can be stemmed |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
* @param prefix prefix to check |
||||
* @return true if the prefix matches and can be stemmed |
||||
*/ |
||||
boolean startsWithCheckLength(char s[], int len, char prefix[]) { |
||||
if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters
|
||||
return false; |
||||
} else if (len < prefix.length + 2) { // other prefixes require only 2.
|
||||
return false; |
||||
} else { |
||||
for (int i = 0; i < prefix.length; i++) |
||||
if (s[i] != prefix[i]) |
||||
return false; |
||||
|
||||
return true; |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Returns true if the suffix matches and can be stemmed |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
* @param suffix suffix to check |
||||
* @return true if the suffix matches and can be stemmed |
||||
*/ |
||||
boolean endsWithCheckLength(char s[], int len, char suffix[]) { |
||||
if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming
|
||||
return false; |
||||
} else { |
||||
for (int i = 0; i < suffix.length; i++) |
||||
if (s[len - suffix.length + i] != suffix[i]) |
||||
return false; |
||||
|
||||
return true; |
||||
} |
||||
} |
||||
} |
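A quick sketch of the stemmer's buffer contract; the sample word is the definite form of "book" and is illustrative:

ArabicStemmer stemmer = new ArabicStemmer();
char[] buffer = "الكتاب".toCharArray();          // "ال" (definite article) + "كتاب"
int newLength = stemmer.stem(buffer, buffer.length);
String stem = new String(buffer, 0, newLength);  // "كتاب": the article is stripped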
@ -0,0 +1,22 @@
|
||||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
||||
<!-- |
||||
Licensed to the Apache Software Foundation (ASF) under one or more |
||||
contributor license agreements. See the NOTICE file distributed with |
||||
this work for additional information regarding copyright ownership. |
||||
The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
(the "License"); you may not use this file except in compliance with |
||||
the License. You may obtain a copy of the License at |
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
||||
Unless required by applicable law or agreed to in writing, software |
||||
distributed under the License is distributed on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
See the License for the specific language governing permissions and |
||||
limitations under the License. |
||||
--> |
||||
<html><head></head> |
||||
<body> |
||||
Analyzer for Arabic. |
||||
</body> |
||||
</html> |
@ -0,0 +1,131 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.bg; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
import java.util.Set; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.Analyzer; |
||||
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.core.StopFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.Tokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet; |
||||
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase; |
||||
import com.fr.third.org.apache.lucene.util.Version; |
||||
|
||||
/** |
||||
* {@link Analyzer} for Bulgarian. |
||||
* <p> |
||||
* This analyzer implements light-stemming as specified by: <i> Searching |
||||
* Strategies for the Bulgarian Language </i> |
||||
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
|
||||
* <p> |
||||
*/ |
||||
public final class BulgarianAnalyzer extends StopwordAnalyzerBase { |
||||
/** |
||||
* File containing default Bulgarian stopwords. |
||||
* |
||||
* Default stopword list is from |
||||
* http://members.unine.ch/jacques.savoy/clef/index.html The stopword list is
|
||||
* BSD-Licensed. |
||||
*/ |
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; |
||||
|
||||
/** |
||||
* Returns an unmodifiable instance of the default stop-words set. |
||||
* |
||||
* @return an unmodifiable instance of the default stop-words set. |
||||
*/ |
||||
public static CharArraySet getDefaultStopSet() { |
||||
return DefaultSetHolder.DEFAULT_STOP_SET; |
||||
} |
||||
|
||||
/** |
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer |
||||
 * class accesses the static final set the first time. |
||||
*/ |
||||
private static class DefaultSetHolder { |
||||
static final CharArraySet DEFAULT_STOP_SET; |
||||
|
||||
static { |
||||
try { |
||||
DEFAULT_STOP_SET = loadStopwordSet(false, BulgarianAnalyzer.class, DEFAULT_STOPWORD_FILE, "#"); |
||||
} catch (IOException ex) { |
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set"); |
||||
} |
||||
} |
||||
} |
||||
|
||||
private final CharArraySet stemExclusionSet; |
||||
|
||||
/** |
||||
* Builds an analyzer with the default stop words: |
||||
* {@link #DEFAULT_STOPWORD_FILE}. |
||||
*/ |
||||
public BulgarianAnalyzer(Version matchVersion) { |
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer with the given stop words. |
||||
*/ |
||||
public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords) { |
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer with the given stop words and a stem exclusion set. |
||||
* If a stem exclusion set is provided this analyzer will add a {@link KeywordMarkerFilter} |
||||
* before {@link BulgarianStemFilter}. |
||||
*/ |
||||
public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { |
||||
super(matchVersion, stopwords); |
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
||||
matchVersion, stemExclusionSet)); |
||||
} |
||||
|
||||
/** |
||||
* Creates a |
||||
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* which tokenizes all the text in the provided {@link Reader}. |
||||
* |
||||
* @return A |
||||
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* built from an {@link StandardTokenizer} filtered with |
||||
 * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, |
||||
 * {@link KeywordMarkerFilter} if a stem exclusion set is |
||||
* provided and {@link BulgarianStemFilter}. |
||||
*/ |
||||
@Override |
||||
public TokenStreamComponents createComponents(String fieldName, Reader reader) { |
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
||||
TokenStream result = new StandardFilter(matchVersion, source); |
||||
result = new LowerCaseFilter(matchVersion, result); |
||||
result = new StopFilter(matchVersion, result, stopwords); |
||||
if(!stemExclusionSet.isEmpty()) |
||||
result = new KeywordMarkerFilter(result, stemExclusionSet); |
||||
result = new BulgarianStemFilter(result); |
||||
return new TokenStreamComponents(source, result); |
||||
} |
||||
} |
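Construction mirrors the other analyzers in this package; a sketch with an illustrative exclusion set (Version.LUCENE_CURRENT is a stand-in):

CharArraySet keepAsIs = new CharArraySet(Version.LUCENE_CURRENT,
    Arrays.asList("градът"), false);
Analyzer analyzer = new BulgarianAnalyzer(Version.LUCENE_CURRENT,
    BulgarianAnalyzer.getDefaultStopSet(), keepAsIs);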
@ -0,0 +1,58 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.bg; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
|
||||
/** |
||||
* A {@link TokenFilter} that applies {@link BulgarianStemmer} to stem Bulgarian |
||||
* words. |
||||
* <p> |
||||
* To prevent terms from being stemmed use an instance of |
||||
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets |
||||
* the {@link KeywordAttribute} before this {@link TokenStream}. |
||||
* </p> |
||||
*/ |
||||
public final class BulgarianStemFilter extends TokenFilter { |
||||
private final BulgarianStemmer stemmer = new BulgarianStemmer(); |
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); |
||||
|
||||
public BulgarianStemFilter(final TokenStream input) { |
||||
super(input); |
||||
} |
||||
|
||||
@Override |
||||
public boolean incrementToken() throws IOException { |
||||
if (input.incrementToken()) { |
||||
if(!keywordAttr.isKeyword()) { |
||||
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); |
||||
termAtt.setLength(newlen); |
||||
} |
||||
return true; |
||||
} else { |
||||
return false; |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,40 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.bg; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.bg.BulgarianStemFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory; |
||||
|
||||
/** |
||||
* Factory for {@link BulgarianStemFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_bgstem" class="solr.TextField" positionIncrementGap="100"> |
||||
* <analyzer> |
||||
* <tokenizer class="solr.StandardTokenizerFactory"/> |
||||
* <filter class="solr.LowerCaseFilterFactory"/> |
||||
* <filter class="solr.BulgarianStemFilterFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
* |
||||
*/ |
||||
public class BulgarianStemFilterFactory extends TokenFilterFactory { |
||||
public TokenStream create(TokenStream input) { |
||||
return new BulgarianStemFilter(input); |
||||
} |
||||
} |
@ -0,0 +1,143 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.bg; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import static com.fr.third.org.apache.lucene.analysis.util.StemmerUtil.*; |
||||
|
||||
/** |
||||
* Light Stemmer for Bulgarian. |
||||
* <p> |
||||
* Implements the algorithm described in: |
||||
* <i> |
||||
* Searching Strategies for the Bulgarian Language |
||||
* </i> |
||||
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
|
||||
*/ |
||||
public class BulgarianStemmer { |
||||
|
||||
/** |
||||
* Stem an input buffer of Bulgarian text. |
||||
* |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
 * @return length of input buffer after stemming |
||||
*/ |
||||
public int stem(final char s[], int len) { |
||||
if (len < 4) // do not stem
|
||||
return len; |
||||
|
||||
if (len > 5 && endsWith(s, len, "ища")) |
||||
return len - 3; |
||||
|
||||
len = removeArticle(s, len); |
||||
len = removePlural(s, len); |
||||
|
||||
if (len > 3) { |
||||
if (endsWith(s, len, "я")) |
||||
len--; |
||||
if (endsWith(s, len, "а") || |
||||
endsWith(s, len, "о") || |
||||
endsWith(s, len, "е")) |
||||
len--; |
||||
} |
||||
|
||||
// the rule to rewrite ен -> н is duplicated in the paper.
|
||||
// in the perl implementation referenced by the paper, this is fixed.
|
||||
// (it is fixed here as well)
|
||||
if (len > 4 && endsWith(s, len, "ен")) { |
||||
s[len - 2] = 'н'; // replace with н
|
||||
len--; |
||||
} |
||||
|
||||
if (len > 5 && s[len - 2] == 'ъ') { |
||||
s[len - 2] = s[len - 1]; // replace ъN with N
|
||||
len--; |
||||
} |
||||
|
||||
return len; |
||||
} |
||||
|
||||
/** |
||||
* Mainly remove the definite article |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
* @return new stemmed length |
||||
*/ |
||||
private int removeArticle(final char s[], final int len) { |
||||
if (len > 6 && endsWith(s, len, "ият")) |
||||
return len - 3; |
||||
|
||||
if (len > 5) { |
||||
if (endsWith(s, len, "ът") || |
||||
endsWith(s, len, "то") || |
||||
endsWith(s, len, "те") || |
||||
endsWith(s, len, "та") || |
||||
endsWith(s, len, "ия")) |
||||
return len - 2; |
||||
} |
||||
|
||||
if (len > 4 && endsWith(s, len, "ят")) |
||||
return len - 2; |
||||
|
||||
return len; |
||||
} |
||||
|
||||
private int removePlural(final char s[], final int len) { |
||||
if (len > 6) { |
||||
if (endsWith(s, len, "овци")) |
||||
return len - 3; // replace with о
|
||||
if (endsWith(s, len, "ове")) |
||||
return len - 3; |
||||
if (endsWith(s, len, "еве")) { |
||||
s[len - 3] = 'й'; // replace with й
|
||||
return len - 2; |
||||
} |
||||
} |
||||
|
||||
if (len > 5) { |
||||
if (endsWith(s, len, "ища")) |
||||
return len - 3; |
||||
if (endsWith(s, len, "та")) |
||||
return len - 2; |
||||
if (endsWith(s, len, "ци")) { |
||||
s[len - 2] = 'к'; // replace with к
|
||||
return len - 1; |
||||
} |
||||
if (endsWith(s, len, "зи")) { |
||||
s[len - 2] = 'г'; // replace with г
|
||||
return len - 1; |
||||
} |
||||
|
||||
if (s[len - 3] == 'е' && s[len - 1] == 'и') { |
||||
s[len - 3] = 'я'; // replace е with я, remove и
|
||||
return len - 1; |
||||
} |
||||
} |
||||
|
||||
if (len > 4) { |
||||
if (endsWith(s, len, "си")) { |
||||
s[len - 2] = 'х'; // replace with х
|
||||
return len - 1; |
||||
} |
||||
if (endsWith(s, len, "и")) |
||||
return len - 1; |
||||
} |
||||
|
||||
return len; |
||||
} |
||||
} |
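A sketch of the in-place contract; "градът" ("the city") loses its definite article:

BulgarianStemmer stemmer = new BulgarianStemmer();
char[] buffer = "градът".toCharArray();
int newLength = stemmer.stem(buffer, buffer.length);
String stem = new String(buffer, 0, newLength); // "град"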
@ -0,0 +1,22 @@
|
||||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
||||
<!-- |
||||
Licensed to the Apache Software Foundation (ASF) under one or more |
||||
contributor license agreements. See the NOTICE file distributed with |
||||
this work for additional information regarding copyright ownership. |
||||
The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
(the "License"); you may not use this file except in compliance with |
||||
the License. You may obtain a copy of the License at |
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
||||
Unless required by applicable law or agreed to in writing, software |
||||
distributed under the License is distributed on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
See the License for the specific language governing permissions and |
||||
limitations under the License. |
||||
--> |
||||
<html><head></head> |
||||
<body> |
||||
Analyzer for Bulgarian. |
||||
</body> |
||||
</html> |
@ -0,0 +1,138 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.br; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.Analyzer; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.Tokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.core.StopFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardAnalyzer; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet; |
||||
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase; |
||||
import com.fr.third.org.apache.lucene.analysis.util.WordlistLoader; |
||||
import com.fr.third.org.apache.lucene.util.IOUtils; |
||||
import com.fr.third.org.apache.lucene.util.Version; |
||||
|
||||
/** |
||||
* {@link Analyzer} for Brazilian Portuguese language. |
||||
* <p> |
||||
* Supports an external list of stopwords (words that |
||||
* will not be indexed at all) and an external list of exclusions (words that will |
||||
* not be stemmed, but indexed). |
||||
* </p> |
||||
* |
||||
* <p><b>NOTE</b>: This class uses the same {@link Version} |
||||
* dependent settings as {@link StandardAnalyzer}.</p> |
||||
*/ |
||||
public final class BrazilianAnalyzer extends StopwordAnalyzerBase { |
||||
/** File containing default Brazilian Portuguese stopwords. */ |
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; |
||||
|
||||
/** |
||||
* Returns an unmodifiable instance of the default stop-words set. |
||||
* @return an unmodifiable instance of the default stop-words set. |
||||
*/ |
||||
public static CharArraySet getDefaultStopSet(){ |
||||
return DefaultSetHolder.DEFAULT_STOP_SET; |
||||
} |
||||
|
||||
private static class DefaultSetHolder { |
||||
static final CharArraySet DEFAULT_STOP_SET; |
||||
|
||||
static { |
||||
try { |
||||
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class, |
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT); |
||||
} catch (IOException ex) { |
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set"); |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
/** |
||||
* Contains words that should be indexed but not stemmed. |
||||
*/ |
||||
private CharArraySet excltable = CharArraySet.EMPTY_SET; |
||||
|
||||
/** |
||||
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}). |
||||
*/ |
||||
public BrazilianAnalyzer(Version matchVersion) { |
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer with the given stop words |
||||
* |
||||
* @param matchVersion |
||||
* lucene compatibility version |
||||
* @param stopwords |
||||
* a stopword set |
||||
*/ |
||||
public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords) { |
||||
super(matchVersion, stopwords); |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer with the given stop words and stemming exclusion words |
||||
* |
||||
* @param matchVersion |
||||
* lucene compatibility version |
||||
* @param stopwords |
||||
* a stopword set |
||||
 * @param stemExclusionSet |
||||
 * a set of terms not to be stemmed |
||||
 */ |
||||
public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords, |
||||
CharArraySet stemExclusionSet) { |
||||
this(matchVersion, stopwords); |
||||
excltable = CharArraySet.unmodifiableSet(CharArraySet |
||||
.copy(matchVersion, stemExclusionSet)); |
||||
} |
||||
|
||||
/** |
||||
* Creates |
||||
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* used to tokenize all the text in the provided {@link Reader}. |
||||
* |
||||
* @return {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* built from a {@link StandardTokenizer} filtered with |
||||
 * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, |
||||
 * and {@link BrazilianStemFilter}. |
||||
*/ |
||||
@Override |
||||
protected TokenStreamComponents createComponents(String fieldName, |
||||
Reader reader) { |
||||
Tokenizer source = new StandardTokenizer(matchVersion, reader); |
||||
TokenStream result = new LowerCaseFilter(matchVersion, source); |
||||
result = new StandardFilter(matchVersion, result); |
||||
result = new StopFilter(matchVersion, result, stopwords); |
||||
if(excltable != null && !excltable.isEmpty()) |
||||
result = new KeywordMarkerFilter(result, excltable); |
||||
return new TokenStreamComponents(source, new BrazilianStemFilter(result)); |
||||
} |
||||
} |
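A usage sketch; the excluded term and the Version constant are illustrative:

CharArraySet noStem = new CharArraySet(Version.LUCENE_CURRENT,
    Arrays.asList("brasileiro"), false);
Analyzer analyzer = new BrazilianAnalyzer(Version.LUCENE_CURRENT,
    BrazilianAnalyzer.getDefaultStopSet(), noStem);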
||||
|
@ -0,0 +1,76 @@

package com.fr.third.org.apache.lucene.analysis.br;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Set;

import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

/**
 * A {@link TokenFilter} that applies {@link BrazilianStemmer}.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 * @see KeywordMarkerFilter
 */
public final class BrazilianStemFilter extends TokenFilter {

  /**
   * {@link BrazilianStemmer} in use by this filter.
   */
  private BrazilianStemmer stemmer = new BrazilianStemmer();
  private Set<?> exclusions = null;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Creates a new BrazilianStemFilter.
   *
   * @param in the source {@link TokenStream}
   */
  public BrazilianStemFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      final String term = termAtt.toString();
      // Check the exclusion table.
      if (!keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains(term))) {
        final String s = stemmer.stem(term);
        // If not stemmed, don't waste the time adjusting the token.
        if ((s != null) && !s.equals(term))
          termAtt.setEmpty().append(s);
      }
      return true;
    } else {
      return false;
    }
  }
}

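The class comment above points to {@link KeywordMarkerFilter} as the way to protect terms from stemming. A hedged sketch of that wiring, assuming the concrete KeywordMarkerFilter of this Lucene 4.x line; the term "brasil" and the whitespace tokenizer are illustrative choices:

import java.io.StringReader;

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.br.BrazilianStemFilter;
import com.fr.third.org.apache.lucene.analysis.core.WhitespaceTokenizer;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

public class ProtectedStemDemo {
  public static TokenStream chain(String text) {
    // Terms in this set get KeywordAttribute#isKeyword() == true, so the
    // exclusion check in incrementToken() above skips stemming for them.
    CharArraySet keywords = new CharArraySet(Version.LUCENE_40, 4, true);
    keywords.add("brasil");
    Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_40, new StringReader(text));
    return new BrazilianStemFilter(new KeywordMarkerFilter(source, keywords));
  }
}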
@ -0,0 +1,41 @@

package com.fr.third.org.apache.lucene.analysis.br;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.br.BrazilianStemFilter;
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link BrazilianStemFilter}.
 * <pre class="prettyprint">
 * <fieldType name="text_brstem" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.StandardTokenizerFactory"/>
 *     <filter class="solr.LowerCaseFilterFactory"/>
 *     <filter class="solr.BrazilianStemFilterFactory"/>
 *   </analyzer>
 * </fieldType></pre>
 */
public class BrazilianStemFilterFactory extends TokenFilterFactory {
  @Override
  public BrazilianStemFilter create(TokenStream in) {
    return new BrazilianStemFilter(in);
  }
}

File diff suppressed because it is too large
@ -0,0 +1,22 @@

<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Brazilian Portuguese.
</body>
</html>
@ -0,0 +1,148 @@

package com.fr.third.org.apache.lucene.analysis.ca;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;

import com.fr.third.org.apache.lucene.analysis.Analyzer;
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter;
import com.fr.third.org.apache.lucene.analysis.core.StopFilter;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.snowball.SnowballFilter;
import com.fr.third.org.apache.lucene.analysis.standard.StandardFilter;
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.analysis.util.ElisionFilter;
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import com.fr.third.org.apache.lucene.util.Version;
import com.fr.third.org.tartarus.snowball.ext.CatalanStemmer;

/**
 * {@link Analyzer} for Catalan.
 * <p>
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating CatalanAnalyzer:
 * <ul>
 *   <li> As of 3.6, ElisionFilter with a set of Catalan
 *        contractions is used by default.
 * </ul>
 */
public final class CatalanAnalyzer extends StopwordAnalyzerBase {
  private final CharArraySet stemExclusionSet;

  /** File containing default Catalan stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
      new CharArraySet(Version.LUCENE_CURRENT,
          Arrays.asList(
              "d", "l", "m", "n", "s", "t"
          ), true));

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static CharArraySet getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = loadStopwordSet(false,
            CatalanAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set", ex);
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public CatalanAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a
   * {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   *
   * @return A
   *         {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is
   *         provided, and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    if (matchVersion.onOrAfter(Version.LUCENE_36)) {
      result = new ElisionFilter(result, DEFAULT_ARTICLES);
    }
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new CatalanStemmer());
    return new TokenStreamComponents(source, result);
  }
}
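A sketch of the Version-gated elision behavior documented above, under the same repackaged API; the field name and input are illustrative, and the exact stemmed output depends on the Catalan snowball stemmer:

import java.io.IOException;
import java.io.StringReader;

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.ca.CatalanAnalyzer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.util.Version;

public class CatalanElisionDemo {
  public static void main(String[] args) throws IOException {
    // With LUCENE_36 or later, the ElisionFilter branch in createComponents()
    // strips the contracted article, so "l'home" is reduced to "home" before
    // stemming; with an earlier match version the "l'" prefix is kept.
    CatalanAnalyzer analyzer = new CatalanAnalyzer(Version.LUCENE_40);
    TokenStream ts = analyzer.tokenStream("f", new StringReader("l'home"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}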
@ -0,0 +1,22 @@

<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Catalan.
</body>
</html>
@ -0,0 +1,110 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fr.third.org.apache.lucene.analysis.charfilter;

import com.fr.third.org.apache.lucene.analysis.CharFilter;
import com.fr.third.org.apache.lucene.util.ArrayUtil;

import java.io.Reader;
import java.util.Arrays;

/**
 * Base utility class for implementing a {@link CharFilter}.
 * Subclasses record mappings by calling {@link #addOffCorrectMap}
 * and then invoke {@link #correct} to map an offset in the filtered
 * output back to the corresponding offset in the original input.
 */
public abstract class BaseCharFilter extends CharFilter {

  private int[] offsets;
  private int[] diffs;
  private int size = 0;

  public BaseCharFilter(Reader in) {
    super(in);
  }

  /** Retrieve the corrected offset. */
  @Override
  protected int correct(int currentOff) {
    if (offsets == null || currentOff < offsets[0]) {
      return currentOff;
    }

    int hi = size - 1;
    if (currentOff >= offsets[hi])
      return currentOff + diffs[hi];

    int lo = 0;
    int mid = -1;

    while (hi >= lo) {
      mid = (lo + hi) >>> 1;
      if (currentOff < offsets[mid])
        hi = mid - 1;
      else if (currentOff > offsets[mid])
        lo = mid + 1;
      else
        return currentOff + diffs[mid];
    }

    if (currentOff < offsets[mid])
      return mid == 0 ? currentOff : currentOff + diffs[mid - 1];
    else
      return currentOff + diffs[mid];
  }

  protected int getLastCumulativeDiff() {
    return offsets == null ? 0 : diffs[size - 1];
  }

  /**
   * <p>
   * Adds an offset correction mapping at the given output stream offset.
   * </p>
   * <p>
   * Assumption: the offset given with each successive call to this method
   * will not be smaller than the offset given at the previous invocation.
   * </p>
   *
   * @param off The output stream offset at which to apply the correction
   * @param cumulativeDiff The input offset is given by adding this
   *                       to the output offset
   */
  protected void addOffCorrectMap(int off, int cumulativeDiff) {
    if (offsets == null) {
      offsets = new int[64];
      diffs = new int[64];
    } else if (size == offsets.length) {
      offsets = ArrayUtil.grow(offsets);
      diffs = ArrayUtil.grow(diffs);
    }

    assert (size == 0 || off >= offsets[size - 1])
        : "Offset #" + size + "(" + off + ") is less than the last recorded offset "
          + offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);

    if (size == 0 || off != offsets[size - 1]) {
      offsets[size] = off;
      diffs[size++] = cumulativeDiff;
    } else { // Overwrite the diff at the last recorded offset
      diffs[size - 1] = cumulativeDiff;
    }
  }
}
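A toy illustration of the mapping contract above; DemoCharFilter and its constants are made up for the example, and correctOffset() is the public entry point inherited from CharFilter that delegates to correct():

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import com.fr.third.org.apache.lucene.analysis.charfilter.BaseCharFilter;

public class OffsetDemo {
  // Pretend 5 input characters were dropped before output offset 10.
  static final class DemoCharFilter extends BaseCharFilter {
    DemoCharFilter(Reader in) {
      super(in);
      addOffCorrectMap(10, 5); // from output offset 10 on: input = output + 5
    }
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
      return input.read(cbuf, off, len); // pass-through; we only exercise offsets
    }
  }

  public static void main(String[] args) throws IOException {
    DemoCharFilter f = new DemoCharFilter(new StringReader(""));
    System.out.println(f.correctOffset(9));  // 9  (before the first mapping)
    System.out.println(f.correctOffset(10)); // 15 (10 + 5)
    System.out.println(f.correctOffset(42)); // 47 (past the last mapping)
    f.close();
  }
}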
@ -0,0 +1,162 @@

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
                    | "Aring" | "Atilde" | "Auml" | "Beta" | "Ccedil" | "Chi"
                    | "Dagger" | "Delta" | "ETH" | "Eacute" | "Ecirc"
                    | "Egrave" | "Epsilon" | "Eta" | "Euml" | "Gamma"
                    | "Iacute" | "Icirc" | "Igrave" | "Iota" | "Iuml" | "Kappa"
                    | "Lambda" | "Mu" | "Ntilde" | "Nu" | "OElig" | "Oacute"
                    | "Ocirc" | "Ograve" | "Omega" | "Omicron" | "Oslash"
                    | "Otilde" | "Ouml" | "Phi" | "Pi" | "Prime" | "Psi"
                    | "Rho" | "Scaron" | "Sigma" | "THORN" | "Tau" | "Theta"
                    | "Uacute" | "Ucirc" | "Ugrave" | "Upsilon" | "Uuml" | "Xi"
                    | "Yacute" | "Yuml" | "Zeta" | "aacute" | "acirc" | "acute"
                    | "aelig" | "agrave" | "alefsym" | "alpha" | "amp" | "AMP"
                    | "and" | "ang" | "apos" | "aring" | "asymp" | "atilde"
                    | "auml" | "bdquo" | "beta" | "brvbar" | "bull" | "cap"
                    | "ccedil" | "cedil" | "cent" | "chi" | "circ" | "clubs"
                    | "cong" | "copy" | "COPY" | "crarr" | "cup" | "curren"
                    | "dArr" | "dagger" | "darr" | "deg" | "delta" | "diams"
                    | "divide" | "eacute" | "ecirc" | "egrave" | "empty"
                    | "emsp" | "ensp" | "epsilon" | "equiv" | "eta" | "eth"
                    | "euml" | "euro" | "exist" | "fnof" | "forall" | "frac12"
                    | "frac14" | "frac34" | "frasl" | "gamma" | "ge" | "gt"
                    | "GT" | "hArr" | "harr" | "hearts" | "hellip" | "iacute"
                    | "icirc" | "iexcl" | "igrave" | "image" | "infin" | "int"
                    | "iota" | "iquest" | "isin" | "iuml" | "kappa" | "lArr"
                    | "lambda" | "lang" | "laquo" | "larr" | "lceil" | "ldquo"
                    | "le" | "lfloor" | "lowast" | "loz" | "lrm" | "lsaquo"
                    | "lsquo" | "lt" | "LT" | "macr" | "mdash" | "micro"
                    | "middot" | "minus" | "mu" | "nabla" | "nbsp" | "ndash"
                    | "ne" | "ni" | "not" | "notin" | "nsub" | "ntilde" | "nu"
                    | "oacute" | "ocirc" | "oelig" | "ograve" | "oline"
                    | "omega" | "omicron" | "oplus" | "or" | "ordf" | "ordm"
                    | "oslash" | "otilde" | "otimes" | "ouml" | "para" | "part"
                    | "permil" | "perp" | "phi" | "pi" | "piv" | "plusmn"
                    | "pound" | "prime" | "prod" | "prop" | "psi" | "quot"
                    | "QUOT" | "rArr" | "radic" | "rang" | "raquo" | "rarr"
                    | "rceil" | "rdquo" | "real" | "reg" | "REG" | "rfloor"
                    | "rho" | "rlm" | "rsaquo" | "rsquo" | "sbquo" | "scaron"
                    | "sdot" | "sect" | "shy" | "sigma" | "sigmaf" | "sim"
                    | "spades" | "sub" | "sube" | "sum" | "sup" | "sup1"
                    | "sup2" | "sup3" | "supe" | "szlig" | "tau" | "there4"
                    | "theta" | "thetasym" | "thinsp" | "thorn" | "tilde"
                    | "times" | "trade" | "uArr" | "uacute" | "uarr" | "ucirc"
                    | "ugrave" | "uml" | "upsih" | "upsilon" | "uuml"
                    | "weierp" | "xi" | "yacute" | "yen" | "yuml" | "zeta"
                    | "zwj" | "zwnj" )
%{
  private static final Map<String,String> upperCaseVariantsAccepted
      = new HashMap<String,String>();
  static {
    upperCaseVariantsAccepted.put("quot", "QUOT");
    upperCaseVariantsAccepted.put("copy", "COPY");
    upperCaseVariantsAccepted.put("gt", "GT");
    upperCaseVariantsAccepted.put("lt", "LT");
    upperCaseVariantsAccepted.put("reg", "REG");
    upperCaseVariantsAccepted.put("amp", "AMP");
  }
  private static final CharArrayMap<Character> entityValues
      = new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
  static {
    String[] entities = {
      "AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
      "Agrave", "\u00C0", "Alpha", "\u0391", "Aring", "\u00C5",
      "Atilde", "\u00C3", "Auml", "\u00C4", "Beta", "\u0392",
      "Ccedil", "\u00C7", "Chi", "\u03A7", "Dagger", "\u2021",
      "Delta", "\u0394", "ETH", "\u00D0", "Eacute", "\u00C9",
      "Ecirc", "\u00CA", "Egrave", "\u00C8", "Epsilon", "\u0395",
      "Eta", "\u0397", "Euml", "\u00CB", "Gamma", "\u0393", "Iacute", "\u00CD",
      "Icirc", "\u00CE", "Igrave", "\u00CC", "Iota", "\u0399",
      "Iuml", "\u00CF", "Kappa", "\u039A", "Lambda", "\u039B", "Mu", "\u039C",
      "Ntilde", "\u00D1", "Nu", "\u039D", "OElig", "\u0152",
      "Oacute", "\u00D3", "Ocirc", "\u00D4", "Ograve", "\u00D2",
      "Omega", "\u03A9", "Omicron", "\u039F", "Oslash", "\u00D8",
      "Otilde", "\u00D5", "Ouml", "\u00D6", "Phi", "\u03A6", "Pi", "\u03A0",
      "Prime", "\u2033", "Psi", "\u03A8", "Rho", "\u03A1", "Scaron", "\u0160",
      "Sigma", "\u03A3", "THORN", "\u00DE", "Tau", "\u03A4", "Theta", "\u0398",
      "Uacute", "\u00DA", "Ucirc", "\u00DB", "Ugrave", "\u00D9",
      "Upsilon", "\u03A5", "Uuml", "\u00DC", "Xi", "\u039E",
      "Yacute", "\u00DD", "Yuml", "\u0178", "Zeta", "\u0396",
      "aacute", "\u00E1", "acirc", "\u00E2", "acute", "\u00B4",
      "aelig", "\u00E6", "agrave", "\u00E0", "alefsym", "\u2135",
      "alpha", "\u03B1", "amp", "\u0026", "and", "\u2227", "ang", "\u2220",
      "apos", "\u0027", "aring", "\u00E5", "asymp", "\u2248",
      "atilde", "\u00E3", "auml", "\u00E4", "bdquo", "\u201E",
      "beta", "\u03B2", "brvbar", "\u00A6", "bull", "\u2022", "cap", "\u2229",
      "ccedil", "\u00E7", "cedil", "\u00B8", "cent", "\u00A2", "chi", "\u03C7",
      "circ", "\u02C6", "clubs", "\u2663", "cong", "\u2245", "copy", "\u00A9",
      "crarr", "\u21B5", "cup", "\u222A", "curren", "\u00A4", "dArr", "\u21D3",
      "dagger", "\u2020", "darr", "\u2193", "deg", "\u00B0", "delta", "\u03B4",
      "diams", "\u2666", "divide", "\u00F7", "eacute", "\u00E9",
      "ecirc", "\u00EA", "egrave", "\u00E8", "empty", "\u2205",
      "emsp", "\u2003", "ensp", "\u2002", "epsilon", "\u03B5",
      "equiv", "\u2261", "eta", "\u03B7", "eth", "\u00F0", "euml", "\u00EB",
      "euro", "\u20AC", "exist", "\u2203", "fnof", "\u0192",
      "forall", "\u2200", "frac12", "\u00BD", "frac14", "\u00BC",
      "frac34", "\u00BE", "frasl", "\u2044", "gamma", "\u03B3", "ge", "\u2265",
      "gt", "\u003E", "hArr", "\u21D4", "harr", "\u2194", "hearts", "\u2665",
      "hellip", "\u2026", "iacute", "\u00ED", "icirc", "\u00EE",
      "iexcl", "\u00A1", "igrave", "\u00EC", "image", "\u2111",
      "infin", "\u221E", "int", "\u222B", "iota", "\u03B9", "iquest", "\u00BF",
      "isin", "\u2208", "iuml", "\u00EF", "kappa", "\u03BA", "lArr", "\u21D0",
      "lambda", "\u03BB", "lang", "\u2329", "laquo", "\u00AB",
      "larr", "\u2190", "lceil", "\u2308", "ldquo", "\u201C", "le", "\u2264",
      "lfloor", "\u230A", "lowast", "\u2217", "loz", "\u25CA", "lrm", "\u200E",
      "lsaquo", "\u2039", "lsquo", "\u2018", "lt", "\u003C", "macr", "\u00AF",
      "mdash", "\u2014", "micro", "\u00B5", "middot", "\u00B7",
      "minus", "\u2212", "mu", "\u03BC", "nabla", "\u2207", "nbsp", "\u00A0",
      "ndash", "\u2013", "ne", "\u2260", "ni", "\u220B", "not", "\u00AC",
      "notin", "\u2209", "nsub", "\u2284", "ntilde", "\u00F1", "nu", "\u03BD",
      "oacute", "\u00F3", "ocirc", "\u00F4", "oelig", "\u0153",
      "ograve", "\u00F2", "oline", "\u203E", "omega", "\u03C9",
      "omicron", "\u03BF", "oplus", "\u2295", "or", "\u2228", "ordf", "\u00AA",
      "ordm", "\u00BA", "oslash", "\u00F8", "otilde", "\u00F5",
      "otimes", "\u2297", "ouml", "\u00F6", "para", "\u00B6", "part", "\u2202",
      "permil", "\u2030", "perp", "\u22A5", "phi", "\u03C6", "pi", "\u03C0",
      "piv", "\u03D6", "plusmn", "\u00B1", "pound", "\u00A3",
      "prime", "\u2032", "prod", "\u220F", "prop", "\u221D", "psi", "\u03C8",
      "quot", "\"", "rArr", "\u21D2", "radic", "\u221A", "rang", "\u232A",
      "raquo", "\u00BB", "rarr", "\u2192", "rceil", "\u2309",
      "rdquo", "\u201D", "real", "\u211C", "reg", "\u00AE", "rfloor", "\u230B",
      "rho", "\u03C1", "rlm", "\u200F", "rsaquo", "\u203A", "rsquo", "\u2019",
      "sbquo", "\u201A", "scaron", "\u0161", "sdot", "\u22C5",
      "sect", "\u00A7", "shy", "\u00AD", "sigma", "\u03C3", "sigmaf", "\u03C2",
      "sim", "\u223C", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286",
      "sum", "\u2211", "sup", "\u2283", "sup1", "\u00B9", "sup2", "\u00B2",
      "sup3", "\u00B3", "supe", "\u2287", "szlig", "\u00DF", "tau", "\u03C4",
      "there4", "\u2234", "theta", "\u03B8", "thetasym", "\u03D1",
      "thinsp", "\u2009", "thorn", "\u00FE", "tilde", "\u02DC",
      "times", "\u00D7", "trade", "\u2122", "uArr", "\u21D1",
      "uacute", "\u00FA", "uarr", "\u2191", "ucirc", "\u00FB",
      "ugrave", "\u00F9", "uml", "\u00A8", "upsih", "\u03D2",
      "upsilon", "\u03C5", "uuml", "\u00FC", "weierp", "\u2118",
      "xi", "\u03BE", "yacute", "\u00FD", "yen", "\u00A5", "yuml", "\u00FF",
      "zeta", "\u03B6", "zwj", "\u200D", "zwnj", "\u200C"
    };
    for (int i = 0 ; i < entities.length ; i += 2) {
      Character value = entities[i + 1].charAt(0);
      entityValues.put(entities[i], value);
      String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);
      if (upperCaseVariant != null) {
        entityValues.put(upperCaseVariant, value);
      }
    }
  }
%}
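The point of keying the table by CharArrayMap rather than a HashMap<String,Character> is that the generated scanner can probe it directly against its lexer buffer (see the `entityValues.get(zzBuffer, zzStartRead, length)` call in HTMLStripCharFilter.jflex below) without allocating a String per candidate entity. A standalone sketch of that call shape, assuming the repackaged CharArrayMap API:

import com.fr.third.org.apache.lucene.analysis.util.CharArrayMap;
import com.fr.third.org.apache.lucene.util.Version;

public class EntityLookupDemo {
  public static void main(String[] args) {
    CharArrayMap<Character> map = new CharArrayMap<Character>(Version.LUCENE_40, 4, false);
    map.put("amp", Character.valueOf('\u0026'));
    char[] buffer = "&amp;".toCharArray();
    // Probe straight from the buffer slice covering "amp": no String is built.
    Character value = map.get(buffer, 1, 3);
    System.out.println(value); // &
  }
}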
@ -0,0 +1,64 @@

/*
 * Copyright 2010 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Generated using ICU4J 49.1.0.0 on Sunday, July 15, 2012 5:42:00 AM UTC
// by com.fr.third.org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros


ID_Start_Supp = (
      [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
    | [\uD81A][\uDC00-\uDE38]
    | [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
    | [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
    | [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
    | [\uD82C][\uDC00\uDC01]
    | [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
    | [\uD81B][\uDF00-\uDF44\uDF50\uDF93-\uDF9F]
    | [\uD87E][\uDC00-\uDE1D]
    | [\uD804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4]
    | [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
    | [\uD809][\uDC00-\uDC62]
    | [\uD808][\uDC00-\uDF6E]
    | [\uD803][\uDC00-\uDC48]
    | [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
    | [\uD80D][\uDC00-\uDC2E]
    | [\uD805][\uDE80-\uDEAA]
    | [\uD86E][\uDC00-\uDC1D]
    | [\uD801][\uDC00-\uDC9D]
)
ID_Continue_Supp = (
      [\uD81A][\uDC00-\uDE38]
    | [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
    | [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
    | [\uD82C][\uDC00\uDC01]
    | [\uD81B][\uDF00-\uDF44\uDF50-\uDF7E\uDF8F-\uDF9F]
    | [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
    | [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
    | [\uD87E][\uDC00-\uDE1D]
    | [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
    | [\uD805][\uDE80-\uDEB7\uDEC0-\uDEC9]
    | [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
    | [\uD809][\uDC00-\uDC62]
    | [\uD808][\uDC00-\uDF6E]
    | [\uD803][\uDC00-\uDC48]
    | [\uD80D][\uDC00-\uDC2E]
    | [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
    | [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA\uDCD0-\uDCE8\uDCF0-\uDCF9\uDD00-\uDD34\uDD36-\uDD3F\uDD80-\uDDC4\uDDD0-\uDDD9]
    | [\uD86E][\uDC00-\uDC1D]
    | [\uDB40][\uDD00-\uDDEF]
    | [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
    | [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
)
File diff suppressed because it is too large
@ -0,0 +1,919 @@

package com.fr.third.org.apache.lucene.analysis.charfilter;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import com.fr.third.org.apache.lucene.util.Version;
import com.fr.third.org.apache.lucene.analysis.util.CharArrayMap;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.analysis.util.OpenStringBuilder;


/**
 * A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
 */
@SuppressWarnings("fallthrough")
%%

%unicode 6.1
%apiprivate
%type int
%final
%public
%char
%function nextChar
%class HTMLStripCharFilter
%extends BaseCharFilter
%xstate AMPERSAND, NUMERIC_CHARACTER, CHARACTER_REFERENCE_TAIL
%xstate LEFT_ANGLE_BRACKET, BANG, COMMENT, SCRIPT, SCRIPT_COMMENT
%xstate LEFT_ANGLE_BRACKET_SLASH, LEFT_ANGLE_BRACKET_SPACE, CDATA
%xstate SERVER_SIDE_INCLUDE, SINGLE_QUOTED_STRING, DOUBLE_QUOTED_STRING
%xstate END_TAG_TAIL_INCLUDE, END_TAG_TAIL_EXCLUDE, END_TAG_TAIL_SUBSTITUTE
%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
%xstate STYLE, STYLE_COMMENT

// From XML 1.0 <http://www.w3.org/TR/xml/>:
//
// [4]  NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
// [4a] NameChar      ::= NameStartChar | "-" | "." | [0-9] | [...]
// [5]  Name          ::= NameStartChar (NameChar)*
//
// From UAX #31: Unicode Identifier and Pattern Syntax
// <http://unicode.org/reports/tr31/>:
//
// D1. Default Identifier Syntax
//
//    <identifier> := <ID_Start> <ID_Continue>*
//
Name = ( ( [:_\p{ID_Start}] | {ID_Start_Supp} ) ( [-.:_\p{ID_Continue}] | {ID_Continue_Supp} )* )

// From Apache httpd mod_include documentation
// <http://httpd.apache.org/docs/current/mod/mod_include.html>:
//
// Basic Elements
//
// The document is parsed as an HTML document, with special commands
// embedded as SGML comments. A command has the syntax:
//
//    <!--#element attribute=value attribute=value ... -->
//
// The value will often be enclosed in double quotes, but single quotes (')
// and backticks (`) are also possible. Many commands only allow a single
// attribute-value pair. Note that the comment terminator (-->) should be
// preceded by whitespace to ensure that it isn't considered part of an SSI
// token. Note that the leading <!--# is one token and may not contain any
// whitespace.
//

EventAttributeSuffixes = ( [aA][bB][oO][rR][tT] |
                           [bB][lL][uU][rR] |
                           [cC][hH][aA][nN][gG][eE] |
                           [cC][lL][iI][cC][kK] |
                           [dD][bB][lL][cC][lL][iI][cC][kK] |
                           [eE][rR][rR][oO][rR] |
                           [fF][oO][cC][uU][sS] |
                           [kK][eE][yY][dD][oO][wW][nN] |
                           [kK][eE][yY][pP][rR][eE][sS][sS] |
                           [kK][eE][yY][uU][pP] |
                           [lL][oO][aA][dD] |
                           [mM][oO][uU][sS][eE][dD][oO][wW][nN] |
                           [mM][oO][uU][sS][eE][mM][oO][vV][eE] |
                           [mM][oO][uU][sS][eE][oO][uU][tT] |
                           [mM][oO][uU][sS][eE][oO][vV][eE][rR] |
                           [mM][oO][uU][sS][eE][uU][pP] |
                           [rR][eE][sS][eE][tT] |
                           [sS][eE][lL][eE][cC][tT] |
                           [sS][uU][bB][mM][iI][tT] |
                           [uU][nN][lL][oO][aA][dD] )

SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )*

InlineElement = ( [aAbBiIqQsSuU] |
                  [aA][bB][bB][rR] |
                  [aA][cC][rR][oO][nN][yY][mM] |
                  [bB][aA][sS][eE][fF][oO][nN][tT] |
                  [bB][dD][oO] |
                  [bB][iI][gG] |
                  [cC][iI][tT][eE] |
                  [cC][oO][dD][eE] |
                  [dD][fF][nN] |
                  [eE][mM] |
                  [fF][oO][nN][tT] |
                  [iI][mM][gG] |
                  [iI][nN][pP][uU][tT] |
                  [kK][bB][dD] |
                  [lL][aA][bB][eE][lL] |
                  [sS][aA][mM][pP] |
                  [sS][eE][lL][eE][cC][tT] |
                  [sS][mM][aA][lL][lL] |
                  [sS][pP][aA][nN] |
                  [sS][tT][rR][iI][kK][eE] |
                  [sS][tT][rR][oO][nN][gG] |
                  [sS][uU][bB] |
                  [sS][uU][pP] |
                  [tT][eE][xX][tT][aA][rR][eE][aA] |
                  [tT][tT] |
                  [vV][aA][rR] )


%include HTMLCharacterEntities.jflex

%include HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro

%{
  private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
  private static final char BLOCK_LEVEL_START_TAG_REPLACEMENT = '\n';
  private static final char BLOCK_LEVEL_END_TAG_REPLACEMENT = '\n';
  private static final char BR_START_TAG_REPLACEMENT = '\n';
  private static final char BR_END_TAG_REPLACEMENT = '\n';
  private static final char SCRIPT_REPLACEMENT = '\n';
  private static final char STYLE_REPLACEMENT = '\n';
  private static final char REPLACEMENT_CHARACTER = '\uFFFD';

  private CharArraySet escapedTags = null;
  private int inputStart;
  private int cumulativeDiff;
  private boolean escapeBR = false;
  private boolean escapeSCRIPT = false;
  private boolean escapeSTYLE = false;
  private int restoreState;
  private int previousRestoreState;
  private int outputCharCount;
  private int eofReturnValue;
  private TextSegment inputSegment
      = new TextSegment(INITIAL_INPUT_SEGMENT_SIZE);
  private TextSegment outputSegment = inputSegment;
  private TextSegment entitySegment = new TextSegment(2);

  /**
   * Creates a new HTMLStripCharFilter over the provided Reader.
   * @param source Reader to strip html tags from.
   */
  public HTMLStripCharFilter(Reader source) {
    super(source);
    this.zzReader = source;
  }

  /**
   * Creates a new HTMLStripCharFilter over the provided Reader
   * with the specified set of escaped tags.
   * @param source Reader to strip html tags from.
   * @param escapedTags Tags in this set (both start and end tags)
   *                    will not be filtered out.
   */
  public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
    super(source);
    this.zzReader = source;
    if (null != escapedTags) {
      for (String tag : escapedTags) {
        if (tag.equalsIgnoreCase("BR")) {
          escapeBR = true;
        } else if (tag.equalsIgnoreCase("SCRIPT")) {
          escapeSCRIPT = true;
        } else if (tag.equalsIgnoreCase("STYLE")) {
          escapeSTYLE = true;
        } else {
          if (null == this.escapedTags) {
            this.escapedTags = new CharArraySet(Version.LUCENE_40, 16, true);
          }
          this.escapedTags.add(tag);
        }
      }
    }
  }

  @Override
  public int read() throws IOException {
    if (outputSegment.isRead()) {
      if (zzAtEOF) {
        return -1;
      }
      int ch = nextChar();
      ++outputCharCount;
      return ch;
    }
    int ch = outputSegment.nextChar();
    ++outputCharCount;
    return ch;
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    int i = 0;
    for ( ; i < len ; ++i) {
      int ch = read();
      if (ch == -1) break;
      cbuf[off++] = (char)ch;
    }
    return i > 0 ? i : (len == 0 ? 0 : -1);
  }

  @Override
  public void close() throws IOException {
    yyclose();
  }

  static int getInitialBufferSize() { // Package private, for testing purposes
    return ZZ_BUFFERSIZE;
  }

  private class TextSegment extends OpenStringBuilder {
    /** The position from which the next char will be read. */
    int pos = 0;

    /** Wraps the given buffer and sets this.len to the given length. */
    TextSegment(char[] buffer, int length) {
      super(buffer, length);
    }

    /** Allocates an internal buffer of the given size. */
    TextSegment(int size) {
      super(size);
    }

    /** Sets len = 0 and pos = 0. */
    void clear() {
      reset();
      restart();
    }

    /** Sets pos = 0. */
    void restart() {
      pos = 0;
    }

    /** Returns the next char in the segment. */
    int nextChar() {
      assert (!isRead()) : "Attempting to read past the end of a segment.";
      return buf[pos++];
    }

    /** Returns true when all characters in the text segment have been read. */
    boolean isRead() {
      return pos >= len;
    }
  }
%}

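// Usage sketch (illustrative only, not part of the generated scanner): the
// filter is a Reader decorator, so stripped text is obtained just by reading
// from it. Assuming the repackaged Lucene 4.x API:
//
//   java.io.Reader stripped = new HTMLStripCharFilter(
//       new java.io.StringReader("<div>caf&eacute; rocks</div>"),
//       java.util.Collections.singleton("b"));
//   // Reading yields roughly "\ncaf\u00E9 rocks\n": the div tags are replaced
//   // by newlines, the entity is decoded, and correctOffset() (inherited from
//   // BaseCharFilter) maps output offsets back into the original markup.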
%eofval{
  return eofReturnValue;
%eofval}
%eof{
  switch (zzLexicalState) {
    case SCRIPT:
    case COMMENT:
    case SCRIPT_COMMENT:
    case STYLE:
    case STYLE_COMMENT:
    case SINGLE_QUOTED_STRING:
    case DOUBLE_QUOTED_STRING:
    case END_TAG_TAIL_EXCLUDE:
    case END_TAG_TAIL_SUBSTITUTE:
    case START_TAG_TAIL_EXCLUDE:
    case SERVER_SIDE_INCLUDE:
    case START_TAG_TAIL_SUBSTITUTE: { // Exclude
      // add (length of input that won't be output) [ - (substitution length) = 0 ]
      cumulativeDiff += yychar - inputStart;
      // position the correction at (already output length) [ + (substitution length) = 0 ]
      addOffCorrectMap(outputCharCount, cumulativeDiff);
      outputSegment.clear();
      eofReturnValue = -1;
      break;
    }
    case CHARACTER_REFERENCE_TAIL: { // Substitute
      // At end of file, allow char refs without semicolons
      // add (length of input that won't be output) - (substitution length)
      cumulativeDiff += inputSegment.length() - outputSegment.length();
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
      eofReturnValue = outputSegment.nextChar();
      break;
    }
    case BANG:
    case CDATA:
    case AMPERSAND:
    case NUMERIC_CHARACTER:
    case END_TAG_TAIL_INCLUDE:
    case START_TAG_TAIL_INCLUDE:
    case LEFT_ANGLE_BRACKET:
    case LEFT_ANGLE_BRACKET_SLASH:
    case LEFT_ANGLE_BRACKET_SPACE: { // Include
      outputSegment = inputSegment;
      eofReturnValue = outputSegment.nextChar();
      break;
    }
    default: {
      eofReturnValue = -1;
    }
  }
%eof}

%%

"&" {
  inputStart = yychar;
  inputSegment.clear();
  inputSegment.append('&');
  yybegin(AMPERSAND);
}

"<" {
  inputStart = yychar;
  inputSegment.clear();
  inputSegment.append('<');
  yybegin(LEFT_ANGLE_BRACKET);
}

<AMPERSAND> {
  {CharacterEntities} {
    int length = yylength();
    inputSegment.write(zzBuffer, zzStartRead, length);
    entitySegment.clear();
    char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
    entitySegment.append(ch);
    outputSegment = entitySegment;
    yybegin(CHARACTER_REFERENCE_TAIL);
  }
  "#" { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER); }

  //                                      1 1     11      11
  // 0 1  2 3       45           678 9 0  1 2     34      5
  "#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
    // Handle paired UTF-16 surrogates.
    outputSegment = entitySegment;
    outputSegment.clear();
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    try {
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(2, 6) + "'";
    }
    try {
      outputSegment.unsafeWrite
          ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing low surrogate '"
                  + surrogatePair.substring(10, 14) + "'";
    }
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 2;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return highSurrogate;
  }

  //                            1 1     11      11
  // 01   2 345   678 9 0  1 2  3 4     5
  "#5" [56] \d{3} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
    // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    try { // High surrogates are in decimal range [55296, 56319]
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(1, 6) + "'";
    }
    if (Character.isHighSurrogate(highSurrogate)) {
      outputSegment = entitySegment;
      outputSegment.clear();
      try {
        outputSegment.unsafeWrite
            ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
      } catch(Exception e) { // should never happen
        assert false: "Exception parsing low surrogate '"
                    + surrogatePair.substring(10, 14) + "'";
      }
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 2;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
      return highSurrogate;
    }
    yypushback(surrogatePair.length() - 1); // Consume only '#'
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
  }

  //                                      1 111     11
  // 0 1  2 3       45           678 9012 3 4
  "#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#" [67] \d{3} ";" {
    // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    char lowSurrogate = '\u0000';
    try {
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(2, 6) + "'";
    }
    try { // Low surrogates are in decimal range [56320, 57343]
      lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing low surrogate '"
                  + surrogatePair.substring(9, 14) + "'";
    }
    if (Character.isLowSurrogate(lowSurrogate)) {
      outputSegment = entitySegment;
      outputSegment.clear();
      outputSegment.unsafeWrite(lowSurrogate);
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 2;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
      return highSurrogate;
    }
    yypushback(surrogatePair.length() - 1); // Consume only '#'
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
  }

  //                            1 111     11
  // 01   2 345   6789012 3     4
  "#5" [56] \d{3} ";&#" [67] \d{3} ";" {
    // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    try { // High surrogates are in decimal range [55296, 56319]
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(1, 6) + "'";
    }
    if (Character.isHighSurrogate(highSurrogate)) {
      char lowSurrogate = '\u0000';
      try { // Low surrogates are in decimal range [56320, 57343]
        lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
      } catch(Exception e) { // should never happen
        assert false: "Exception parsing low surrogate '"
                    + surrogatePair.substring(9, 14) + "'";
      }
      if (Character.isLowSurrogate(lowSurrogate)) {
        outputSegment = entitySegment;
        outputSegment.clear();
        outputSegment.unsafeWrite(lowSurrogate);
        // add (previously matched input length) + (this match length) - (substitution length)
        cumulativeDiff += inputSegment.length() + yylength() - 2;
        // position the correction at (already output length) + (substitution length)
        addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
        inputSegment.clear();
        yybegin(YYINITIAL);
        return highSurrogate;
      }
    }
    yypushback(surrogatePair.length() - 1); // Consume only '#'
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
  }
}

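// Worked example for the paired-surrogate rules above: "&#55357;&#56832;" is
// the decimal form of the UTF-16 pair 0xD83D 0xDE00 (U+1F600). The dec-dec
// rule parses the high surrogate from substring(1, 6) and the low one from
// substring(9, 14), returns the high surrogate immediately, and leaves the
// low surrogate in entitySegment, so the code point reaches the caller intact
// across two successive read() calls.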
<NUMERIC_CHARACTER> {
  [xX] [0-9A-Fa-f]+ {
    int matchLength = yylength();
    inputSegment.write(zzBuffer, zzStartRead, matchLength);
    if (matchLength <= 6) { // 10FFFF: max 6 hex chars
      String hexCharRef
          = new String(zzBuffer, zzStartRead + 1, matchLength - 1);
      int codePoint = 0;
      try {
        codePoint = Integer.parseInt(hexCharRef, 16);
      } catch(Exception e) {
        assert false: "Exception parsing hex code point '" + hexCharRef + "'";
      }
      if (codePoint <= 0x10FFFF) {
        outputSegment = entitySegment;
        outputSegment.clear();
        if (codePoint >= Character.MIN_SURROGATE
            && codePoint <= Character.MAX_SURROGATE) {
          outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
        } else {
          outputSegment.setLength
              (Character.toChars(codePoint, outputSegment.getArray(), 0));
        }
        yybegin(CHARACTER_REFERENCE_TAIL);
      } else {
        outputSegment = inputSegment;
        yybegin(YYINITIAL);
        return outputSegment.nextChar();
      }
    } else {
      outputSegment = inputSegment;
      yybegin(YYINITIAL);
      return outputSegment.nextChar();
    }
  }
  [0-9]+ {
    int matchLength = yylength();
    inputSegment.write(zzBuffer, zzStartRead, matchLength);
    if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
      String decimalCharRef = yytext();
      int codePoint = 0;
      try {
        codePoint = Integer.parseInt(decimalCharRef);
      } catch(Exception e) {
        assert false: "Exception parsing code point '" + decimalCharRef + "'";
      }
      if (codePoint <= 0x10FFFF) {
        outputSegment = entitySegment;
        outputSegment.clear();
        if (codePoint >= Character.MIN_SURROGATE
            && codePoint <= Character.MAX_SURROGATE) {
          outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
        } else {
          outputSegment.setLength
              (Character.toChars(codePoint, outputSegment.getArray(), 0));
        }
        yybegin(CHARACTER_REFERENCE_TAIL);
      } else {
        outputSegment = inputSegment;
        yybegin(YYINITIAL);
        return outputSegment.nextChar();
      }
    } else {
      outputSegment = inputSegment;
      yybegin(YYINITIAL);
      return outputSegment.nextChar();
    }
  }
}

<CHARACTER_REFERENCE_TAIL> {
  ";" {
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
  }
}

<LEFT_ANGLE_BRACKET_SLASH> {
  \s+ { inputSegment.write(zzBuffer, zzStartRead, yylength()); }
  [bB][rR] \s* ">" {
    yybegin(YYINITIAL);
    if (escapeBR) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 1;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_END_TAG_REPLACEMENT;
    }
  }
  {InlineElement} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_EXCLUDE);
    }
  }
  {Name} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_SUBSTITUTE);
    }
  }
}

<END_TAG_TAIL_INCLUDE> {
  \s* ">" {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    outputSegment = inputSegment;
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
  }
}

<END_TAG_TAIL_EXCLUDE> {
  \s* ">" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
  }
}

<END_TAG_TAIL_SUBSTITUTE> {
  \s* ">" {
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
  }
}

<LEFT_ANGLE_BRACKET> {
  "!" { inputSegment.append('!'); yybegin(BANG); }
  "/" { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH); }
  \s+ {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    yybegin(LEFT_ANGLE_BRACKET_SPACE);
  }
  "?" [^>]* [/?] ">" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
  }
  \s* [bB][rR] ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    yybegin(YYINITIAL);
    if (escapeBR) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 1;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_START_TAG_REPLACEMENT;
    }
  }
  \s* [sS][cC][rR][iI][pP][tT] ( \s+ {OpenTagContent} )? \s* ">" {
    yybegin(SCRIPT);
    if (escapeSCRIPT) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      inputStart += 1 + yylength();
      return outputSegment.nextChar();
    }
  }
  \s* [sS][tT][yY][lL][eE] ( \s+ {OpenTagContent} )? \s* ">" {
    yybegin(STYLE);
    if (escapeSTYLE) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      inputStart += 1 + yylength();
      return outputSegment.nextChar();
    }
  }
}

<LEFT_ANGLE_BRACKET, LEFT_ANGLE_BRACKET_SPACE> {
  {InlineElement} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(START_TAG_TAIL_INCLUDE);
    } else {
      yybegin(START_TAG_TAIL_EXCLUDE);
    }
  }
  {Name} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(START_TAG_TAIL_INCLUDE);
    } else {
      yybegin(START_TAG_TAIL_SUBSTITUTE);
    }
  }
}

<START_TAG_TAIL_INCLUDE> {
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    outputSegment = inputSegment;
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
  }
}

<START_TAG_TAIL_EXCLUDE> {
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    outputSegment = inputSegment;
    yybegin(YYINITIAL);
  }
}

<START_TAG_TAIL_SUBSTITUTE> {
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
  }
}

<BANG> {
  "--" { yybegin(COMMENT); }
  ">" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
  }
  // From XML 1.0 <http://www.w3.org/TR/xml/>:
  //
  // [18] CDSect  ::= CDStart CData CDEnd
  // [19] CDStart ::= '<![CDATA['
  // [20] CData   ::= (Char* - (Char* ']]>' Char*))
||||
// [21] CDEnd ::= ']]>' |
||||
// |
||||
"[CDATA[" { |
||||
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ] |
||||
cumulativeDiff += inputSegment.length() + yylength(); |
||||
// position the correction at (already output length) [ + (substitution length) = 0 ] |
||||
addOffCorrectMap(outputCharCount, cumulativeDiff); |
||||
inputSegment.clear(); |
||||
yybegin(CDATA); |
||||
} |
||||
[^] { |
||||
inputSegment.append(zzBuffer[zzStartRead]); |
||||
} |
||||
} |
||||
|
||||
<CDATA> { |
||||
"]]>" { |
||||
// add (this match length) [ - (substitution length) = 0 ] |
||||
cumulativeDiff += yylength(); |
||||
// position the correction at (already output length) [ + (substitution length) = 0 ] |
||||
addOffCorrectMap(outputCharCount, cumulativeDiff); |
||||
yybegin(YYINITIAL); |
||||
} |
||||
[^] { return zzBuffer[zzStartRead]; } |
||||
} |
||||
|
||||
<COMMENT> { |
||||
"<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); } |
||||
"-->" { |
||||
// add (previously matched input length) + (this match length) [ - (substitution length) = 0] |
||||
cumulativeDiff += yychar - inputStart + yylength(); |
||||
// position the correction at (already output length) [ + (substitution length) = 0] |
||||
addOffCorrectMap(outputCharCount, cumulativeDiff); |
||||
inputSegment.clear(); |
||||
yybegin(YYINITIAL); |
||||
} |
||||
[^] { } |
||||
} |
||||
|
||||
<SERVER_SIDE_INCLUDE> { |
||||
"-->" { yybegin(restoreState); } |
||||
"'" { |
||||
previousRestoreState = restoreState; |
||||
restoreState = SERVER_SIDE_INCLUDE; |
||||
yybegin(SINGLE_QUOTED_STRING); |
||||
} |
||||
"\"" { |
||||
previousRestoreState = restoreState; |
||||
restoreState = SERVER_SIDE_INCLUDE; |
||||
yybegin(DOUBLE_QUOTED_STRING); |
||||
} |
||||
[^] { } |
||||
} |
||||
|
||||
<SCRIPT_COMMENT> { |
||||
"<!--#" { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE); } |
||||
"'" { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING); } |
||||
"\"" { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING); } |
||||
"-->" { yybegin(SCRIPT); } |
||||
[^] { } |
||||
} |
||||
|
||||
<STYLE_COMMENT> { |
||||
"<!--#" { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE); } |
||||
"'" { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING); } |
||||
"\"" { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING); } |
||||
"-->" { yybegin(STYLE); } |
||||
[^] { } |
||||
} |
||||
|
||||
<SINGLE_QUOTED_STRING> { |
||||
"\\" [^] { } |
||||
"'" { yybegin(restoreState); restoreState = previousRestoreState; } |
||||
[^] { } |
||||
} |
||||
|
||||
<DOUBLE_QUOTED_STRING> { |
||||
"\\" [^] { } |
||||
"\"" { yybegin(restoreState); restoreState = previousRestoreState; } |
||||
[^] { } |
||||
} |
||||
|
||||
<SCRIPT> { |
||||
"<!--" { yybegin(SCRIPT_COMMENT); } |
||||
"</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" { |
||||
inputSegment.clear(); |
||||
yybegin(YYINITIAL); |
||||
// add (previously matched input length) -- current match and substitution handled below |
||||
cumulativeDiff += yychar - inputStart; |
||||
// position at (already output length) -- substitution handled below |
||||
int offsetCorrectionPos = outputCharCount; |
||||
int returnValue; |
||||
if (escapeSCRIPT) { |
||||
inputSegment.write(zzBuffer, zzStartRead, yylength()); |
||||
outputSegment = inputSegment; |
||||
returnValue = outputSegment.nextChar(); |
||||
} else { |
||||
// add (this match length) - (substitution length) |
||||
cumulativeDiff += yylength() - 1; |
||||
// add (substitution length) |
||||
++offsetCorrectionPos; |
||||
returnValue = SCRIPT_REPLACEMENT; |
||||
} |
||||
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff); |
||||
return returnValue; |
||||
} |
||||
[^] { } |
||||
} |
||||
|
||||
<STYLE> { |
||||
"<!--" { yybegin(STYLE_COMMENT); } |
||||
"</" \s* [sS][tT][yY][lL][eE] \s* ">" { |
||||
inputSegment.clear(); |
||||
yybegin(YYINITIAL); |
||||
// add (previously matched input length) -- current match and substitution handled below |
||||
cumulativeDiff += yychar - inputStart; |
||||
// position the offset correction at (already output length) -- substitution handled below |
||||
int offsetCorrectionPos = outputCharCount; |
||||
int returnValue; |
||||
if (escapeSTYLE) { |
||||
inputSegment.write(zzBuffer, zzStartRead, yylength()); |
||||
outputSegment = inputSegment; |
||||
returnValue = outputSegment.nextChar(); |
||||
} else { |
||||
// add (this match length) - (substitution length) |
||||
cumulativeDiff += yylength() - 1; |
||||
// add (substitution length) |
||||
++offsetCorrectionPos; |
||||
returnValue = STYLE_REPLACEMENT; |
||||
} |
||||
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff); |
||||
return returnValue; |
||||
} |
||||
[^] { } |
||||
} |
||||
|
||||
<AMPERSAND,NUMERIC_CHARACTER,CHARACTER_REFERENCE_TAIL,LEFT_ANGLE_BRACKET_SLASH,END_TAG_TAIL_INCLUDE,END_TAG_TAIL_EXCLUDE,END_TAG_TAIL_SUBSTITUTE,LEFT_ANGLE_BRACKET,LEFT_ANGLE_BRACKET_SPACE,START_TAG_TAIL_INCLUDE,START_TAG_TAIL_EXCLUDE,START_TAG_TAIL_SUBSTITUTE,BANG> { |
||||
[^] { |
||||
yypushback(1); |
||||
outputSegment = inputSegment; |
||||
outputSegment.restart(); |
||||
yybegin(YYINITIAL); |
||||
return outputSegment.nextChar(); |
||||
} |
||||
} |
||||
|
||||
[^] { return zzBuffer[zzStartRead]; } |
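The rules above all follow one bookkeeping pattern: whenever stripped or replaced markup makes the output shorter than the input, the rule adds the lost length to cumulativeDiff and records it at the current output position via addOffCorrectMap(outputPos, cumulativeDiff). A minimal sketch of that mechanism (an illustration of the idea only, not the generated filter's actual code) is:

    import java.util.Map;
    import java.util.TreeMap;

    class OffsetCorrectionSketch {
      // output position -> cumulative (input length - output length) so far
      private final TreeMap<Integer,Integer> diffs = new TreeMap<Integer,Integer>();

      void addOffCorrectMap(int outputPos, int cumulativeDiff) {
        diffs.put(outputPos, cumulativeDiff);
      }

      // Maps an offset in the stripped output back to the original input:
      // find the last correction at or before the output offset and add it.
      int correct(int outputOffset) {
        Map.Entry<Integer,Integer> e = diffs.floorEntry(outputOffset);
        return outputOffset + (e == null ? 0 : e.getValue());
      }
    }

For example, stripping "<br>" from "foo<br>bar" with escapeBR off turns 4 input chars into 1 replacement char, so the rule records a cumulative diff of 3 at output position 4, and correct(4) maps the 'b' of "bar" back to input offset 7.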
@ -0,0 +1,70 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.charfilter; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharFilterFactory; |
||||
|
||||
import java.io.Reader; |
||||
import java.util.HashSet; |
||||
import java.util.Map; |
||||
import java.util.Set; |
||||
import java.util.regex.Matcher; |
||||
import java.util.regex.Pattern; |
||||
|
||||
/** |
||||
* Factory for {@link HTMLStripCharFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_html" class="solr.TextField" positionIncrementGap="100"> |
||||
* <analyzer> |
||||
* <charFilter class="solr.HTMLStripCharFilterFactory" escapedTags="a, title" /> |
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
* |
||||
*/ |
||||
public class HTMLStripCharFilterFactory extends CharFilterFactory { |
||||
|
||||
Set<String> escapedTags = null; |
||||
Pattern TAG_NAME_PATTERN = Pattern.compile("[^\\s,]+"); |
||||
|
||||
public HTMLStripCharFilter create(Reader input) { |
||||
HTMLStripCharFilter charFilter; |
||||
if (null == escapedTags) { |
||||
charFilter = new HTMLStripCharFilter(input); |
||||
} else { |
||||
charFilter = new HTMLStripCharFilter(input, escapedTags); |
||||
} |
||||
return charFilter; |
||||
} |
||||
|
||||
@Override |
||||
public void init(Map<String,String> args) { |
||||
super.init(args); |
||||
String escapedTagsArg = args.get("escapedTags"); |
||||
if (null != escapedTagsArg) { |
||||
Matcher matcher = TAG_NAME_PATTERN.matcher(escapedTagsArg); |
||||
while (matcher.find()) { |
||||
if (null == escapedTags) { |
||||
escapedTags = new HashSet<String>(); |
||||
} |
||||
escapedTags.add(matcher.group(0)); |
||||
} |
||||
} |
||||
} |
||||
} |
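A usage sketch for the factory above (the driver class and sample input are hypothetical; only init(Map) and create(Reader) come from this file):

    import java.io.Reader;
    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;

    public class HTMLStripCharFilterFactoryUsage {
      public static void main(String[] args) throws Exception {
        HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
        Map<String,String> params = new HashMap<String,String>();
        params.put("escapedTags", "a, title");  // same syntax as the javadoc example
        factory.init(params);

        Reader stripped =
            factory.create(new StringReader("<b>bold</b> <a href=\"#\">kept</a>"));
        int c;
        while ((c = stripped.read()) != -1) {
          System.out.print((char) c);  // expected: bold <a href="#">kept</a>
        }
      }
    }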
@ -0,0 +1,191 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package com.fr.third.org.apache.lucene.analysis.charfilter; |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
import java.util.Map; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.CharFilter; // javadocs |
||||
import com.fr.third.org.apache.lucene.analysis.util.RollingCharBuffer; |
||||
import com.fr.third.org.apache.lucene.util.CharsRef; |
||||
import com.fr.third.org.apache.lucene.util.fst.CharSequenceOutputs; |
||||
import com.fr.third.org.apache.lucene.util.fst.FST; |
||||
import com.fr.third.org.apache.lucene.util.fst.Outputs; |
||||
|
||||
/** |
||||
* Simplistic {@link CharFilter} that applies the mappings |
||||
* contained in a {@link NormalizeCharMap} to the character |
||||
* stream, and corrects the resulting changes to the |
||||
* offsets. Matching is greedy (longest pattern matching at |
||||
* a given point wins). Replacement is allowed to be the |
||||
* empty string. |
||||
*/ |
||||
|
||||
public class MappingCharFilter extends BaseCharFilter { |
||||
|
||||
private final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); |
||||
private final FST<CharsRef> map; |
||||
private final FST.BytesReader fstReader; |
||||
private final RollingCharBuffer buffer = new RollingCharBuffer(); |
||||
private final FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>(); |
||||
private final Map<Character,FST.Arc<CharsRef>> cachedRootArcs; |
||||
|
||||
private CharsRef replacement; |
||||
private int replacementPointer; |
||||
private int inputOff; |
||||
|
||||
/** Default constructor that takes a {@link Reader}. */ |
||||
public MappingCharFilter(NormalizeCharMap normMap, Reader in) { |
||||
super(in); |
||||
buffer.reset(in); |
||||
|
||||
map = normMap.map; |
||||
cachedRootArcs = normMap.cachedRootArcs; |
||||
|
||||
if (map != null) { |
||||
fstReader = map.getBytesReader(0); |
||||
} else { |
||||
fstReader = null; |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public void reset() throws IOException { |
||||
input.reset(); |
||||
buffer.reset(input); |
||||
replacement = null; |
||||
inputOff = 0; |
||||
} |
||||
|
||||
@Override |
||||
public int read() throws IOException { |
||||
|
||||
//System.out.println("\nread");
|
||||
while(true) { |
||||
|
||||
if (replacement != null && replacementPointer < replacement.length) { |
||||
//System.out.println(" return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
|
||||
return replacement.chars[replacement.offset + replacementPointer++]; |
||||
} |
||||
|
||||
// TODO: a more efficient approach would be Aho/Corasick's |
||||
// algorithm |
||||
// (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm) |
||||
// or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps |
||||
// |
||||
// I think this would be (almost?) equivalent to 1) adding |
||||
// epsilon arcs from all final nodes back to the init |
||||
// node in the FST, 2) adding a .* (skip any char) |
||||
// loop on the initial node, and 3) determinizing |
||||
// that.  Then we would not have to restart matching |
||||
// at each position. |
||||
|
||||
int lastMatchLen = -1; |
||||
CharsRef lastMatch = null; |
||||
|
||||
final int firstCH = buffer.get(inputOff); |
||||
if (firstCH != -1) { |
||||
FST.Arc<CharsRef> arc = cachedRootArcs.get(Character.valueOf((char) firstCH)); |
||||
if (arc != null) { |
||||
if (!FST.targetHasArcs(arc)) { |
||||
// Fast pass for single character match: |
||||
assert arc.isFinal(); |
||||
lastMatchLen = 1; |
||||
lastMatch = arc.output; |
||||
} else { |
||||
int lookahead = 0; |
||||
CharsRef output = arc.output; |
||||
while (true) { |
||||
lookahead++; |
||||
|
||||
if (arc.isFinal()) { |
||||
// Match! (to node is final) |
||||
lastMatchLen = lookahead; |
||||
lastMatch = outputs.add(output, arc.nextFinalOutput); |
||||
// Greedy: keep searching to see if there's a |
||||
// longer match... |
||||
} |
||||
|
||||
if (!FST.targetHasArcs(arc)) { |
||||
break; |
||||
} |
||||
|
||||
int ch = buffer.get(inputOff + lookahead); |
||||
if (ch == -1) { |
||||
break; |
||||
} |
||||
if ((arc = map.findTargetArc(ch, arc, scratchArc, fstReader)) == null) { |
||||
// Dead end |
||||
break; |
||||
} |
||||
output = outputs.add(output, arc.output); |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
if (lastMatch != null) { |
||||
inputOff += lastMatchLen; |
||||
//System.out.println(" match! len=" + lastMatchLen + " repl=" + lastMatch);
|
||||
|
||||
final int diff = lastMatchLen - lastMatch.length; |
||||
|
||||
if (diff != 0) { |
||||
final int prevCumulativeDiff = getLastCumulativeDiff(); |
||||
if (diff > 0) { |
||||
// Replacement is shorter than matched input: |
||||
addOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff); |
||||
} else { |
||||
// Replacement is longer than matched input: remap |
||||
// the "extra" chars all back to the same input |
||||
// offset: |
||||
final int outputStart = inputOff - prevCumulativeDiff; |
||||
for(int extraIDX=0;extraIDX<-diff;extraIDX++) { |
||||
addOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1); |
||||
} |
||||
} |
||||
} |
||||
|
||||
replacement = lastMatch; |
||||
replacementPointer = 0; |
||||
|
||||
} else { |
||||
final int ret = buffer.get(inputOff); |
||||
if (ret != -1) { |
||||
inputOff++; |
||||
buffer.freeBefore(inputOff); |
||||
} |
||||
return ret; |
||||
} |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public int read(char[] cbuf, int off, int len) throws IOException { |
||||
int numRead = 0; |
||||
for(int i = off; i < off + len; i++) { |
||||
int c = read(); |
||||
if (c == -1) break; |
||||
cbuf[i] = (char) c; |
||||
numRead++; |
||||
} |
||||
|
||||
return numRead == 0 ? -1 : numRead; |
||||
} |
||||
} |
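A minimal usage sketch (hypothetical driver class; the Builder is the NormalizeCharMap.Builder defined later in this change):

    import java.io.Reader;
    import java.io.StringReader;

    public class MappingCharFilterUsage {
      public static void main(String[] args) throws Exception {
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        builder.add("ph", "f");   // matching is greedy, so "ph" wins over any "p" rule
        builder.add("qu", "kw");
        NormalizeCharMap normMap = builder.build();

        Reader mapped = new MappingCharFilter(normMap, new StringReader("quick photo"));
        int c;
        while ((c = mapped.read()) != -1) {
          System.out.print((char) c);  // expected: kwick foto
        }
      }
    }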
@ -0,0 +1,135 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.charfilter; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.File; |
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
import java.util.ArrayList; |
||||
import java.util.List; |
||||
import java.util.regex.Matcher; |
||||
import java.util.regex.Pattern; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.charfilter.MappingCharFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.charfilter.NormalizeCharMap; |
||||
import com.fr.third.org.apache.lucene.analysis.util.*; |
||||
|
||||
/** |
||||
* Factory for {@link MappingCharFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_map" class="solr.TextField" positionIncrementGap="100"> |
||||
* <analyzer> |
||||
* <charFilter class="solr.MappingCharFilterFactory" mapping="mapping.txt"/> |
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
* |
||||
* |
||||
* @since Solr 1.4 |
||||
* |
||||
*/ |
||||
public class MappingCharFilterFactory extends CharFilterFactory implements |
||||
ResourceLoaderAware, MultiTermAwareComponent { |
||||
|
||||
protected NormalizeCharMap normMap; |
||||
private String mapping; |
||||
|
||||
// TODO: this should use inputstreams from the loader, not File! |
||||
public void inform(ResourceLoader loader) throws IOException { |
||||
mapping = args.get("mapping"); |
||||
|
||||
if (mapping != null) { |
||||
List<String> wlist = null; |
||||
File mappingFile = new File(mapping); |
||||
if (mappingFile.exists()) { |
||||
wlist = getLines(loader, mapping); |
||||
} else { |
||||
List<String> files = splitFileNames(mapping); |
||||
wlist = new ArrayList<String>(); |
||||
for (String file : files) { |
||||
List<String> lines = getLines(loader, file.trim()); |
||||
wlist.addAll(lines); |
||||
} |
||||
} |
||||
final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); |
||||
parseRules(wlist, builder); |
||||
normMap = builder.build(); |
||||
if (normMap.map == null) { |
||||
// if the inner FST is null, it means it accepts nothing (e.g. the file is empty) |
||||
// so just set the whole map to null |
||||
normMap = null; |
||||
} |
||||
} |
||||
} |
||||
|
||||
public Reader create(Reader input) { |
||||
// if the map is null, it means there's actually no mappings... just return the original stream |
||||
// as there is nothing to do here. |
||||
return normMap == null ? input : new MappingCharFilter(normMap,input); |
||||
} |
||||
|
||||
// "source" => "target"
|
||||
static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" ); |
||||
|
||||
protected void parseRules( List<String> rules, NormalizeCharMap.Builder builder ){ |
||||
for( String rule : rules ){ |
||||
Matcher m = p.matcher( rule ); |
||||
if( !m.find() ) |
||||
throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "], file = " + mapping); |
||||
builder.add( parseString( m.group( 1 ) ), parseString( m.group( 2 ) ) ); |
||||
} |
||||
} |
||||
|
||||
char[] out = new char[256]; |
||||
|
||||
protected String parseString( String s ){ |
||||
int readPos = 0; |
||||
int len = s.length(); |
||||
int writePos = 0; |
||||
while( readPos < len ){ |
||||
char c = s.charAt( readPos++ ); |
||||
if( c == '\\' ){ |
||||
if( readPos >= len ) |
||||
throw new IllegalArgumentException("Invalid escaped char in [" + s + "]"); |
||||
c = s.charAt( readPos++ ); |
||||
switch( c ) { |
||||
case '\\' : c = '\\'; break; |
||||
case '"' : c = '"'; break; |
||||
case 'n' : c = '\n'; break; |
||||
case 't' : c = '\t'; break; |
||||
case 'r' : c = '\r'; break; |
||||
case 'b' : c = '\b'; break; |
||||
case 'f' : c = '\f'; break; |
||||
case 'u' : |
||||
if( readPos + 3 >= len ) |
||||
throw new IllegalArgumentException("Invalid escaped char in [" + s + "]"); |
||||
c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 ); |
||||
readPos += 4; |
||||
break; |
||||
} |
||||
} |
||||
out[writePos++] = c; |
||||
} |
||||
return new String( out, 0, writePos ); |
||||
} |
||||
|
||||
@Override |
||||
public AbstractAnalysisFactory getMultiTermComponent() { |
||||
return this; |
||||
} |
||||
} |
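For reference, the rule syntax that parseRules()/parseString() above accept is one quoted pair per line; a hypothetical mapping.txt could contain:

    "&amp;" => "&"
    "\u00E9" => "e"
    "\t" => " "

Each left-hand string is fed to NormalizeCharMap.Builder.add(), so the usual Builder restrictions apply (no empty and no duplicate match strings), and the backslash escapes are exactly the ones handled by the switch in parseString().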
@ -0,0 +1,127 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package com.fr.third.org.apache.lucene.analysis.charfilter; |
||||
|
||||
import java.io.IOException; |
||||
import java.util.HashMap; |
||||
import java.util.Map; |
||||
import java.util.TreeMap; |
||||
|
||||
import com.fr.third.org.apache.lucene.util.CharsRef; |
||||
import com.fr.third.org.apache.lucene.util.IntsRef; |
||||
import com.fr.third.org.apache.lucene.util.fst.Builder; |
||||
import com.fr.third.org.apache.lucene.util.fst.CharSequenceOutputs; |
||||
import com.fr.third.org.apache.lucene.util.fst.FST; |
||||
import com.fr.third.org.apache.lucene.util.fst.Outputs; |
||||
import com.fr.third.org.apache.lucene.util.fst.Util; |
||||
|
||||
// TODO: save/load? |
||||
|
||||
/** |
||||
* Holds a map of String input to String output, to be used |
||||
* with {@link MappingCharFilter}. Use the {@link Builder} |
||||
* to create this. |
||||
*/ |
||||
public class NormalizeCharMap { |
||||
|
||||
final FST<CharsRef> map; |
||||
final Map<Character,FST.Arc<CharsRef>> cachedRootArcs = new HashMap<Character,FST.Arc<CharsRef>>(); |
||||
|
||||
// Use the builder to create: |
||||
private NormalizeCharMap(FST<CharsRef> map) { |
||||
this.map = map; |
||||
if (map != null) { |
||||
try { |
||||
// Pre-cache root arcs: |
||||
final FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>(); |
||||
final FST.BytesReader fstReader = map.getBytesReader(0); |
||||
map.getFirstArc(scratchArc); |
||||
if (FST.targetHasArcs(scratchArc)) { |
||||
map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader); |
||||
while(true) { |
||||
assert scratchArc.label != FST.END_LABEL; |
||||
cachedRootArcs.put(Character.valueOf((char) scratchArc.label), new FST.Arc<CharsRef>().copyFrom(scratchArc)); |
||||
if (scratchArc.isLast()) { |
||||
break; |
||||
} |
||||
map.readNextRealArc(scratchArc, fstReader); |
||||
} |
||||
} |
||||
//System.out.println("cached " + cachedRootArcs.size() + " root arcs");
|
||||
} catch (IOException ioe) { |
||||
// Bogus FST IOExceptions!! (will never happen) |
||||
throw new RuntimeException(ioe); |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Builds a NormalizeCharMap. |
||||
* <p> |
||||
* Call add() until you have added all the mappings, then call build() to get a NormalizeCharMap. |
||||
* @lucene.experimental |
||||
*/ |
||||
public static class Builder { |
||||
|
||||
private final Map<String,String> pendingPairs = new TreeMap<String,String>(); |
||||
|
||||
/** Records a replacement to be applied to the input |
||||
* stream. Whenever <code>match</code> occurs in |
||||
* the input, it will be replaced with |
||||
* <code>replacement</code>. |
||||
* |
||||
* @param match input String to be replaced |
||||
* @param replacement output String |
||||
* @throws IllegalArgumentException if |
||||
* <code>match</code> is the empty string, or was |
||||
* already previously added |
||||
*/ |
||||
public void add(String match, String replacement) { |
||||
if (match.length() == 0 ){ |
||||
throw new IllegalArgumentException("cannot match the empty string"); |
||||
} |
||||
if (pendingPairs.containsKey(match)) { |
||||
throw new IllegalArgumentException("match \"" + match + "\" was already added"); |
||||
} |
||||
pendingPairs.put(match, replacement); |
||||
} |
||||
|
||||
/** Builds the NormalizeCharMap; call this once you |
||||
* are done calling {@link #add}. */ |
||||
public NormalizeCharMap build() { |
||||
|
||||
final FST<CharsRef> map; |
||||
try { |
||||
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); |
||||
final com.fr.third.org.apache.lucene.util.fst.Builder<CharsRef> builder = new com.fr.third.org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs); |
||||
final IntsRef scratch = new IntsRef(); |
||||
for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { |
||||
builder.add(Util.toUTF16(ent.getKey(), scratch), |
||||
new CharsRef(ent.getValue())); |
||||
} |
||||
map = builder.finish(); |
||||
pendingPairs.clear(); |
||||
} catch (IOException ioe) { |
||||
// Bogus FST IOExceptions!! (will never happen) |
||||
throw new RuntimeException(ioe); |
||||
} |
||||
|
||||
return new NormalizeCharMap(map); |
||||
} |
||||
} |
||||
} |
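A sketch of the Builder contract documented above (hypothetical snippet; the exception behavior matches the checks in add()):

    public class NormalizeCharMapBuilderContract {
      public static void main(String[] args) {
        NormalizeCharMap.Builder b = new NormalizeCharMap.Builder();
        b.add("oe", "\u0153");            // oe -> œ
        try {
          b.add("oe", "\u00F6");          // rejected: duplicate match string
        } catch (IllegalArgumentException expected) {
          System.out.println(expected.getMessage()); // match "oe" was already added
        }
        NormalizeCharMap map = b.build(); // FST built from the one surviving pair
      }
    }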
@ -0,0 +1,539 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
||||
# contributor license agreements. See the NOTICE file distributed with |
||||
# this work for additional information regarding copyright ownership. |
||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
# (the "License"); you may not use this file except in compliance with |
||||
# the License. You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
import re |
||||
|
||||
# A simple python script to generate an HTML entity map and a regex alternation |
||||
# for inclusion in HTMLStripCharFilter.jflex. |
||||
|
||||
def main(): |
||||
print get_apache_license() |
||||
codes = {} |
||||
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"') |
||||
for line in get_entity_text().split('\n'): |
||||
match = regex.match(line) |
||||
if match: |
||||
key = match.group(1) |
||||
if key == 'quot': codes[key] = r'\"' |
||||
elif key == 'nbsp': codes[key] = ' ' |
||||
else : codes[key] = r'\u%04X' % int(match.group(2)) |
||||
|
||||
keys = sorted(codes) |
||||
|
||||
first_entry = True |
||||
output_line = 'CharacterEntities = ( ' |
||||
for key in keys: |
||||
new_entry = ('"%s"' if first_entry else ' | "%s"') % key |
||||
first_entry = False |
||||
if len(output_line) + len(new_entry) >= 80: |
||||
print output_line |
||||
output_line = ' ' |
||||
output_line += new_entry |
||||
if key in ('quot','copy','gt','lt','reg','amp'): |
||||
new_entry = ' | "%s"' % key.upper() |
||||
if len(output_line) + len(new_entry) >= 80: |
||||
print output_line |
||||
output_line = ' ' |
||||
output_line += new_entry |
||||
print output_line, ')' |
||||
|
||||
print '%{' |
||||
print ' private static final Map<String,String> upperCaseVariantsAccepted' |
||||
print ' = new HashMap<String,String>();' |
||||
print ' static {' |
||||
print ' upperCaseVariantsAccepted.put("quot", "QUOT");' |
||||
print ' upperCaseVariantsAccepted.put("copy", "COPY");' |
||||
print ' upperCaseVariantsAccepted.put("gt", "GT");' |
||||
print ' upperCaseVariantsAccepted.put("lt", "LT");' |
||||
print ' upperCaseVariantsAccepted.put("reg", "REG");' |
||||
print ' upperCaseVariantsAccepted.put("amp", "AMP");' |
||||
print ' }' |
||||
print ' private static final CharArrayMap<Character> entityValues' |
||||
print ' = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys) |
||||
print ' static {' |
||||
print ' String[] entities = {' |
||||
output_line = ' ' |
||||
for key in keys: |
||||
new_entry = ' "%s", "%s",' % (key, codes[key]) |
||||
if len(output_line) + len(new_entry) >= 80: |
||||
print output_line |
||||
output_line = ' ' |
||||
output_line += new_entry |
||||
print output_line[:-1] |
||||
print ' };' |
||||
print ' for (int i = 0 ; i < entities.length ; i += 2) {' |
||||
print ' Character value = entities[i + 1].charAt(0);' |
||||
print ' entityValues.put(entities[i], value);' |
||||
print ' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);' |
||||
print ' if (upperCaseVariant != null) {' |
||||
print ' entityValues.put(upperCaseVariant, value);' |
||||
print ' }' |
||||
print ' }' |
||||
print " }" |
||||
print "%}" |
||||
|
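# For orientation, the output printed by main() above is a JFlex macro plus a
# Java static block, shaped roughly like this (abbreviated and illustrative,
# not verbatim output):
#
#   CharacterEntities = ( "AElig" | "AMP" | "Aacute" | "Acirc" | "Agrave"
#                       ...
#                       | "zwj" | "zwnj" )
#   %{
#     private static final Map<String,String> upperCaseVariantsAccepted
#         = new HashMap<String,String>();
#     ...
#   %}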
||||
def get_entity_text(): |
||||
# The text below is taken verbatim from |
||||
# <http://www.w3.org/TR/REC-html40/sgml/entities.html>: |
||||
text = r""" |
||||
F.1. XHTML Character Entities |
||||
|
||||
XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section. |
||||
F.1.1. XHTML Latin 1 Character Entities |
||||
|
||||
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent. |
||||
|
||||
<!-- ...................................................................... --> |
||||
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ --> |
||||
<!-- file: xhtml-lat1.ent |
||||
|
||||
Typical invocation: |
||||
|
||||
<!ENTITY % xhtml-lat1 |
||||
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN" |
||||
"xhtml-lat1.ent" > |
||||
%xhtml-lat1; |
||||
|
||||
This DTD module is identified by the PUBLIC and SYSTEM identifiers: |
||||
|
||||
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN" |
||||
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent" |
||||
|
||||
Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI |
||||
|
||||
Portions (C) International Organization for Standardization 1986: |
||||
Permission to copy in any form is granted for use with conforming |
||||
SGML systems and applications as defined in ISO 8879, provided |
||||
this notice is included in all copies. |
||||
--> |
||||
|
||||
<!ENTITY nbsp " " ><!-- no-break space = non-breaking space, U+00A0 ISOnum --> |
||||
<!ENTITY iexcl "¡" ><!-- inverted exclamation mark, U+00A1 ISOnum --> |
||||
<!ENTITY cent "¢" ><!-- cent sign, U+00A2 ISOnum --> |
||||
<!ENTITY pound "£" ><!-- pound sign, U+00A3 ISOnum --> |
||||
<!ENTITY curren "¤" ><!-- currency sign, U+00A4 ISOnum --> |
||||
<!ENTITY yen "¥" ><!-- yen sign = yuan sign, U+00A5 ISOnum --> |
||||
<!ENTITY brvbar "¦" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum --> |
||||
<!ENTITY sect "§" ><!-- section sign, U+00A7 ISOnum --> |
||||
<!ENTITY uml "¨" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia --> |
||||
<!ENTITY copy "©" ><!-- copyright sign, U+00A9 ISOnum --> |
||||
<!ENTITY ordf "ª" ><!-- feminine ordinal indicator, U+00AA ISOnum --> |
||||
<!ENTITY laquo "«" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum --> |
||||
<!ENTITY not "¬" ><!-- not sign, U+00AC ISOnum --> |
||||
<!ENTITY shy "­" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum --> |
||||
<!ENTITY reg "®" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum --> |
||||
<!ENTITY macr "¯" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia --> |
||||
<!ENTITY deg "°" ><!-- degree sign, U+00B0 ISOnum --> |
||||
<!ENTITY plusmn "±" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum --> |
||||
<!ENTITY sup2 "²" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum --> |
||||
<!ENTITY sup3 "³" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum --> |
||||
<!ENTITY acute "´" ><!-- acute accent = spacing acute, U+00B4 ISOdia --> |
||||
<!ENTITY micro "µ" ><!-- micro sign, U+00B5 ISOnum --> |
||||
<!ENTITY para "¶" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum --> |
||||
<!ENTITY middot "·" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum --> |
||||
<!ENTITY cedil "¸" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia --> |
||||
<!ENTITY sup1 "¹" ><!-- superscript one = superscript digit one, U+00B9 ISOnum --> |
||||
<!ENTITY ordm "º" ><!-- masculine ordinal indicator, U+00BA ISOnum --> |
||||
<!ENTITY raquo "»" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum --> |
||||
<!ENTITY frac14 "¼" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum --> |
||||
<!ENTITY frac12 "½" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum --> |
||||
<!ENTITY frac34 "¾" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum --> |
||||
<!ENTITY iquest "¿" ><!-- inverted question mark = turned question mark, U+00BF ISOnum --> |
||||
<!ENTITY Agrave "À" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 --> |
||||
<!ENTITY Aacute "Á" ><!-- latin capital A with acute, U+00C1 ISOlat1 --> |
||||
<!ENTITY Acirc "Â" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 --> |
||||
<!ENTITY Atilde "Ã" ><!-- latin capital A with tilde, U+00C3 ISOlat1 --> |
||||
<!ENTITY Auml "Ä" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 --> |
||||
<!ENTITY Aring "Å" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 --> |
||||
<!ENTITY AElig "Æ" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 --> |
||||
<!ENTITY Ccedil "Ç" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 --> |
||||
<!ENTITY Egrave "È" ><!-- latin capital E with grave, U+00C8 ISOlat1 --> |
||||
<!ENTITY Eacute "É" ><!-- latin capital E with acute, U+00C9 ISOlat1 --> |
||||
<!ENTITY Ecirc "Ê" ><!-- latin capital E with circumflex, U+00CA ISOlat1 --> |
||||
<!ENTITY Euml "Ë" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 --> |
||||
<!ENTITY Igrave "Ì" ><!-- latin capital I with grave, U+00CC ISOlat1 --> |
||||
<!ENTITY Iacute "Í" ><!-- latin capital I with acute, U+00CD ISOlat1 --> |
||||
<!ENTITY Icirc "Î" ><!-- latin capital I with circumflex, U+00CE ISOlat1 --> |
||||
<!ENTITY Iuml "Ï" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 --> |
||||
<!ENTITY ETH "Ð" ><!-- latin capital ETH, U+00D0 ISOlat1 --> |
||||
<!ENTITY Ntilde "Ñ" ><!-- latin capital N with tilde, U+00D1 ISOlat1 --> |
||||
<!ENTITY Ograve "Ò" ><!-- latin capital O with grave, U+00D2 ISOlat1 --> |
||||
<!ENTITY Oacute "Ó" ><!-- latin capital O with acute, U+00D3 ISOlat1 --> |
||||
<!ENTITY Ocirc "Ô" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 --> |
||||
<!ENTITY Otilde "Õ" ><!-- latin capital O with tilde, U+00D5 ISOlat1 --> |
||||
<!ENTITY Ouml "Ö" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 --> |
||||
<!ENTITY times "×" ><!-- multiplication sign, U+00D7 ISOnum --> |
||||
<!ENTITY Oslash "Ø" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 --> |
||||
<!ENTITY Ugrave "Ù" ><!-- latin capital U with grave, U+00D9 ISOlat1 --> |
||||
<!ENTITY Uacute "Ú" ><!-- latin capital U with acute, U+00DA ISOlat1 --> |
||||
<!ENTITY Ucirc "Û" ><!-- latin capital U with circumflex, U+00DB ISOlat1 --> |
||||
<!ENTITY Uuml "Ü" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 --> |
||||
<!ENTITY Yacute "Ý" ><!-- latin capital Y with acute, U+00DD ISOlat1 --> |
||||
<!ENTITY THORN "Þ" ><!-- latin capital THORN, U+00DE ISOlat1 --> |
||||
<!ENTITY szlig "ß" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 --> |
||||
<!ENTITY agrave "à" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 --> |
||||
<!ENTITY aacute "á" ><!-- latin small a with acute, U+00E1 ISOlat1 --> |
||||
<!ENTITY acirc "â" ><!-- latin small a with circumflex, U+00E2 ISOlat1 --> |
||||
<!ENTITY atilde "ã" ><!-- latin small a with tilde, U+00E3 ISOlat1 --> |
||||
<!ENTITY auml "ä" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 --> |
||||
<!ENTITY aring "å" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 --> |
||||
<!ENTITY aelig "æ" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 --> |
||||
<!ENTITY ccedil "ç" ><!-- latin small c with cedilla, U+00E7 ISOlat1 --> |
||||
<!ENTITY egrave "è" ><!-- latin small e with grave, U+00E8 ISOlat1 --> |
||||
<!ENTITY eacute "é" ><!-- latin small e with acute, U+00E9 ISOlat1 --> |
||||
<!ENTITY ecirc "ê" ><!-- latin small e with circumflex, U+00EA ISOlat1 --> |
||||
<!ENTITY euml "ë" ><!-- latin small e with diaeresis, U+00EB ISOlat1 --> |
||||
<!ENTITY igrave "ì" ><!-- latin small i with grave, U+00EC ISOlat1 --> |
||||
<!ENTITY iacute "í" ><!-- latin small i with acute, U+00ED ISOlat1 --> |
||||
<!ENTITY icirc "î" ><!-- latin small i with circumflex, U+00EE ISOlat1 --> |
||||
<!ENTITY iuml "ï" ><!-- latin small i with diaeresis, U+00EF ISOlat1 --> |
||||
<!ENTITY eth "ð" ><!-- latin small eth, U+00F0 ISOlat1 --> |
||||
<!ENTITY ntilde "ñ" ><!-- latin small n with tilde, U+00F1 ISOlat1 --> |
||||
<!ENTITY ograve "ò" ><!-- latin small o with grave, U+00F2 ISOlat1 --> |
||||
<!ENTITY oacute "ó" ><!-- latin small o with acute, U+00F3 ISOlat1 --> |
||||
<!ENTITY ocirc "ô" ><!-- latin small o with circumflex, U+00F4 ISOlat1 --> |
||||
<!ENTITY otilde "õ" ><!-- latin small o with tilde, U+00F5 ISOlat1 --> |
||||
<!ENTITY ouml "ö" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 --> |
||||
<!ENTITY divide "÷" ><!-- division sign, U+00F7 ISOnum --> |
||||
<!ENTITY oslash "ø" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 --> |
||||
<!ENTITY ugrave "ù" ><!-- latin small u with grave, U+00F9 ISOlat1 --> |
||||
<!ENTITY uacute "ú" ><!-- latin small u with acute, U+00FA ISOlat1 --> |
||||
<!ENTITY ucirc "û" ><!-- latin small u with circumflex, U+00FB ISOlat1 --> |
||||
<!ENTITY uuml "ü" ><!-- latin small u with diaeresis, U+00FC ISOlat1 --> |
||||
<!ENTITY yacute "ý" ><!-- latin small y with acute, U+00FD ISOlat1 --> |
||||
<!ENTITY thorn "þ" ><!-- latin small thorn with, U+00FE ISOlat1 --> |
||||
<!ENTITY yuml "ÿ" ><!-- latin small y with diaeresis, U+00FF ISOlat1 --> |
||||
<!-- end of xhtml-lat1.ent --> |
||||
|
||||
F.1.2. XHTML Special Characters |
||||
|
||||
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent. |
||||
|
||||
<!-- ...................................................................... --> |
||||
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ --> |
||||
<!-- file: xhtml-special.ent |
||||
|
||||
Typical invocation: |
||||
|
||||
<!ENTITY % xhtml-special |
||||
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN" |
||||
"xhtml-special.ent" > |
||||
%xhtml-special; |
||||
|
||||
This DTD module is identified by the PUBLIC and SYSTEM identifiers: |
||||
|
||||
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN" |
||||
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent" |
||||
|
||||
Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI |
||||
|
||||
Portions (C) International Organization for Standardization 1986: |
||||
Permission to copy in any form is granted for use with conforming |
||||
SGML systems and applications as defined in ISO 8879, provided |
||||
this notice is included in all copies. |
||||
|
||||
Revisions: |
||||
2000-10-28: added ' and altered XML Predefined Entities for compatibility |
||||
--> |
||||
|
||||
<!-- Relevant ISO entity set is given unless names are newly introduced. |
||||
New names (i.e., not in ISO 8879 [SGML] list) do not clash with |
||||
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character |
||||
numbers are given for each character, in hex. Entity values are |
||||
decimal conversions of the ISO 10646 values and refer to the |
||||
document character set. Names are Unicode [UNICODE] names. |
||||
--> |
||||
|
||||
<!-- C0 Controls and Basic Latin --> |
||||
<!ENTITY lt "&#60;" ><!-- less-than sign, U+003C ISOnum --> |
||||
<!ENTITY gt ">" ><!-- greater-than sign, U+003E ISOnum --> |
||||
<!ENTITY amp "&#38;" ><!-- ampersand, U+0026 ISOnum --> |
||||
<!ENTITY apos "'" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum --> |
||||
<!ENTITY quot """ ><!-- quotation mark (Quote Double), U+0022 ISOnum --> |
||||
|
||||
<!-- Latin Extended-A --> |
||||
<!ENTITY OElig "Œ" ><!-- latin capital ligature OE, U+0152 ISOlat2 --> |
||||
<!ENTITY oelig "œ" ><!-- latin small ligature oe, U+0153 ISOlat2 --> |
||||
|
||||
<!-- ligature is a misnomer, this is a separate character in some languages --> |
||||
<!ENTITY Scaron "Š" ><!-- latin capital letter S with caron, U+0160 ISOlat2 --> |
||||
<!ENTITY scaron "š" ><!-- latin small letter s with caron, U+0161 ISOlat2 --> |
||||
<!ENTITY Yuml "Ÿ" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 --> |
||||
|
||||
<!-- Spacing Modifier Letters --> |
||||
<!ENTITY circ "ˆ" ><!-- modifier letter circumflex accent, U+02C6 ISOpub --> |
||||
<!ENTITY tilde "˜" ><!-- small tilde, U+02DC ISOdia --> |
||||
|
||||
<!-- General Punctuation --> |
||||
<!ENTITY ensp " " ><!-- en space, U+2002 ISOpub --> |
||||
<!ENTITY emsp " " ><!-- em space, U+2003 ISOpub --> |
||||
<!ENTITY thinsp " " ><!-- thin space, U+2009 ISOpub --> |
||||
<!ENTITY zwnj "‌" ><!-- zero width non-joiner, U+200C NEW RFC 2070 --> |
||||
<!ENTITY zwj "‍" ><!-- zero width joiner, U+200D NEW RFC 2070 --> |
||||
<!ENTITY lrm "‎" ><!-- left-to-right mark, U+200E NEW RFC 2070 --> |
||||
<!ENTITY rlm "‏" ><!-- right-to-left mark, U+200F NEW RFC 2070 --> |
||||
<!ENTITY ndash "–" ><!-- en dash, U+2013 ISOpub --> |
||||
<!ENTITY mdash "—" ><!-- em dash, U+2014 ISOpub --> |
||||
<!ENTITY lsquo "‘" ><!-- left single quotation mark, U+2018 ISOnum --> |
||||
<!ENTITY rsquo "’" ><!-- right single quotation mark, U+2019 ISOnum --> |
||||
<!ENTITY sbquo "‚" ><!-- single low-9 quotation mark, U+201A NEW --> |
||||
<!ENTITY ldquo "“" ><!-- left double quotation mark, U+201C ISOnum --> |
||||
<!ENTITY rdquo "”" ><!-- right double quotation mark, U+201D ISOnum --> |
||||
<!ENTITY bdquo "„" ><!-- double low-9 quotation mark, U+201E NEW --> |
||||
<!ENTITY dagger "†" ><!-- dagger, U+2020 ISOpub --> |
||||
<!ENTITY Dagger "‡" ><!-- double dagger, U+2021 ISOpub --> |
||||
<!ENTITY permil "‰" ><!-- per mille sign, U+2030 ISOtech --> |
||||
|
||||
<!-- lsaquo is proposed but not yet ISO standardized --> |
||||
<!ENTITY lsaquo "‹" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed --> |
||||
<!-- rsaquo is proposed but not yet ISO standardized --> |
||||
<!ENTITY rsaquo "›" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed --> |
||||
<!ENTITY euro "€" ><!-- euro sign, U+20AC NEW --> |
||||
|
||||
<!-- end of xhtml-special.ent --> |
||||
|
||||
F.1.3. XHTML Mathematical, Greek, and Symbolic Characters |
||||
|
||||
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent. |
||||
|
||||
<!-- ...................................................................... --> |
||||
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... --> |
||||
<!-- file: xhtml-symbol.ent |
||||
|
||||
Typical invocation: |
||||
|
||||
<!ENTITY % xhtml-symbol |
||||
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN" |
||||
"xhtml-symbol.ent" > |
||||
%xhtml-symbol; |
||||
|
||||
This DTD module is identified by the PUBLIC and SYSTEM identifiers: |
||||
|
||||
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN" |
||||
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent" |
||||
|
||||
Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI |
||||
|
||||
Portions (C) International Organization for Standardization 1986: |
||||
Permission to copy in any form is granted for use with conforming |
||||
SGML systems and applications as defined in ISO 8879, provided |
||||
this notice is included in all copies. |
||||
--> |
||||
|
||||
<!-- Relevant ISO entity set is given unless names are newly introduced. |
||||
New names (i.e., not in ISO 8879 [SGML] list) do not clash with |
||||
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character |
||||
numbers are given for each character, in hex. Entity values are |
||||
decimal conversions of the ISO 10646 values and refer to the |
||||
document character set. Names are Unicode [UNICODE] names. |
||||
--> |
||||
|
||||
<!-- Latin Extended-B --> |
||||
<!ENTITY fnof "ƒ" ><!-- latin small f with hook = function |
||||
= florin, U+0192 ISOtech --> |
||||
|
||||
<!-- Greek --> |
||||
<!ENTITY Alpha "Α" ><!-- greek capital letter alpha, U+0391 --> |
||||
<!ENTITY Beta "Β" ><!-- greek capital letter beta, U+0392 --> |
||||
<!ENTITY Gamma "Γ" ><!-- greek capital letter gamma, U+0393 ISOgrk3 --> |
||||
<!ENTITY Delta "Δ" ><!-- greek capital letter delta, U+0394 ISOgrk3 --> |
||||
<!ENTITY Epsilon "Ε" ><!-- greek capital letter epsilon, U+0395 --> |
||||
<!ENTITY Zeta "Ζ" ><!-- greek capital letter zeta, U+0396 --> |
||||
<!ENTITY Eta "Η" ><!-- greek capital letter eta, U+0397 --> |
||||
<!ENTITY Theta "Θ" ><!-- greek capital letter theta, U+0398 ISOgrk3 --> |
||||
<!ENTITY Iota "Ι" ><!-- greek capital letter iota, U+0399 --> |
||||
<!ENTITY Kappa "Κ" ><!-- greek capital letter kappa, U+039A --> |
||||
<!ENTITY Lambda "Λ" ><!-- greek capital letter lambda, U+039B ISOgrk3 --> |
||||
<!ENTITY Mu "Μ" ><!-- greek capital letter mu, U+039C --> |
||||
<!ENTITY Nu "Ν" ><!-- greek capital letter nu, U+039D --> |
||||
<!ENTITY Xi "Ξ" ><!-- greek capital letter xi, U+039E ISOgrk3 --> |
||||
<!ENTITY Omicron "Ο" ><!-- greek capital letter omicron, U+039F --> |
||||
<!ENTITY Pi "Π" ><!-- greek capital letter pi, U+03A0 ISOgrk3 --> |
||||
<!ENTITY Rho "Ρ" ><!-- greek capital letter rho, U+03A1 --> |
||||
<!-- there is no Sigmaf, and no U+03A2 character either --> |
||||
<!ENTITY Sigma "Σ" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 --> |
||||
<!ENTITY Tau "Τ" ><!-- greek capital letter tau, U+03A4 --> |
||||
<!ENTITY Upsilon "Υ" ><!-- greek capital letter upsilon, |
||||
U+03A5 ISOgrk3 --> |
||||
<!ENTITY Phi "Φ" ><!-- greek capital letter phi, U+03A6 ISOgrk3 --> |
||||
<!ENTITY Chi "Χ" ><!-- greek capital letter chi, U+03A7 --> |
||||
<!ENTITY Psi "Ψ" ><!-- greek capital letter psi, U+03A8 ISOgrk3 --> |
||||
<!ENTITY Omega "Ω" ><!-- greek capital letter omega, U+03A9 ISOgrk3 --> |
||||
<!ENTITY alpha "α" ><!-- greek small letter alpha, U+03B1 ISOgrk3 --> |
||||
<!ENTITY beta "β" ><!-- greek small letter beta, U+03B2 ISOgrk3 --> |
||||
<!ENTITY gamma "γ" ><!-- greek small letter gamma, U+03B3 ISOgrk3 --> |
||||
<!ENTITY delta "δ" ><!-- greek small letter delta, U+03B4 ISOgrk3 --> |
||||
<!ENTITY epsilon "ε" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 --> |
||||
<!ENTITY zeta "ζ" ><!-- greek small letter zeta, U+03B6 ISOgrk3 --> |
||||
<!ENTITY eta "η" ><!-- greek small letter eta, U+03B7 ISOgrk3 --> |
||||
<!ENTITY theta "θ" ><!-- greek small letter theta, U+03B8 ISOgrk3 --> |
||||
<!ENTITY iota "ι" ><!-- greek small letter iota, U+03B9 ISOgrk3 --> |
||||
<!ENTITY kappa "κ" ><!-- greek small letter kappa, U+03BA ISOgrk3 --> |
||||
<!ENTITY lambda "λ" ><!-- greek small letter lambda, U+03BB ISOgrk3 --> |
||||
<!ENTITY mu "μ" ><!-- greek small letter mu, U+03BC ISOgrk3 --> |
||||
<!ENTITY nu "ν" ><!-- greek small letter nu, U+03BD ISOgrk3 --> |
||||
<!ENTITY xi "ξ" ><!-- greek small letter xi, U+03BE ISOgrk3 --> |
||||
<!ENTITY omicron "ο" ><!-- greek small letter omicron, U+03BF NEW --> |
||||
<!ENTITY pi "π" ><!-- greek small letter pi, U+03C0 ISOgrk3 --> |
||||
<!ENTITY rho "ρ" ><!-- greek small letter rho, U+03C1 ISOgrk3 --> |
||||
<!ENTITY sigmaf "ς" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 --> |
||||
<!ENTITY sigma "σ" ><!-- greek small letter sigma, U+03C3 ISOgrk3 --> |
||||
<!ENTITY tau "τ" ><!-- greek small letter tau, U+03C4 ISOgrk3 --> |
||||
<!ENTITY upsilon "υ" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 --> |
||||
<!ENTITY phi "φ" ><!-- greek small letter phi, U+03C6 ISOgrk3 --> |
||||
<!ENTITY chi "χ" ><!-- greek small letter chi, U+03C7 ISOgrk3 --> |
||||
<!ENTITY psi "ψ" ><!-- greek small letter psi, U+03C8 ISOgrk3 --> |
||||
<!ENTITY omega "ω" ><!-- greek small letter omega, U+03C9 ISOgrk3 --> |
||||
<!ENTITY thetasym "ϑ" ><!-- greek small letter theta symbol, U+03D1 NEW --> |
||||
<!ENTITY upsih "ϒ" ><!-- greek upsilon with hook symbol, U+03D2 NEW --> |
||||
<!ENTITY piv "ϖ" ><!-- greek pi symbol, U+03D6 ISOgrk3 --> |
||||
|
||||
<!-- General Punctuation --> |
||||
<!ENTITY bull "•" ><!-- bullet = black small circle, U+2022 ISOpub --> |
||||
<!-- bullet is NOT the same as bullet operator, U+2219 --> |
||||
<!ENTITY hellip "…" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub --> |
||||
<!ENTITY prime "′" ><!-- prime = minutes = feet, U+2032 ISOtech --> |
||||
<!ENTITY Prime "″" ><!-- double prime = seconds = inches, U+2033 ISOtech --> |
||||
<!ENTITY oline "‾" ><!-- overline = spacing overscore, U+203E NEW --> |
||||
<!ENTITY frasl "⁄" ><!-- fraction slash, U+2044 NEW --> |
||||
|
||||
<!-- Letterlike Symbols --> |
||||
<!ENTITY weierp "℘" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso --> |
||||
<!ENTITY image "ℑ" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso --> |
||||
<!ENTITY real "ℜ" ><!-- blackletter capital R = real part symbol, U+211C ISOamso --> |
||||
<!ENTITY trade "™" ><!-- trade mark sign, U+2122 ISOnum --> |
||||
<!ENTITY alefsym "ℵ" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW --> |
||||
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although |
||||
the same glyph could be used to depict both characters --> |
||||
|
||||
<!-- Arrows --> |
||||
<!ENTITY larr "←" ><!-- leftwards arrow, U+2190 ISOnum --> |
||||
<!ENTITY uarr "↑" ><!-- upwards arrow, U+2191 ISOnum--> |
||||
<!ENTITY rarr "→" ><!-- rightwards arrow, U+2192 ISOnum --> |
||||
<!ENTITY darr "↓" ><!-- downwards arrow, U+2193 ISOnum --> |
||||
<!ENTITY harr "↔" ><!-- left right arrow, U+2194 ISOamsa --> |
||||
<!ENTITY crarr "↵" ><!-- downwards arrow with corner leftwards |
||||
= carriage return, U+21B5 NEW --> |
||||
<!ENTITY lArr "⇐" ><!-- leftwards double arrow, U+21D0 ISOtech --> |
||||
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow |
||||
but also does not have any other character for that function. So ? lArr can |
||||
be used for 'is implied by' as ISOtech suggests --> |
||||
<!ENTITY uArr "⇑" ><!-- upwards double arrow, U+21D1 ISOamsa --> |
||||
<!ENTITY rArr "⇒" ><!-- rightwards double arrow, U+21D2 ISOtech --> |
||||
<!-- Unicode does not say this is the 'implies' character but does not have |
||||
another character with this function so ? |
||||
rArr can be used for 'implies' as ISOtech suggests --> |
||||
<!ENTITY dArr "⇓" ><!-- downwards double arrow, U+21D3 ISOamsa --> |
||||
<!ENTITY hArr "⇔" ><!-- left right double arrow, U+21D4 ISOamsa --> |
||||
|
||||
<!-- Mathematical Operators --> |
||||
<!ENTITY forall "∀" ><!-- for all, U+2200 ISOtech --> |
||||
<!ENTITY part "∂" ><!-- partial differential, U+2202 ISOtech --> |
||||
<!ENTITY exist "∃" ><!-- there exists, U+2203 ISOtech --> |
||||
<!ENTITY empty "∅" ><!-- empty set = null set, U+2205 ISOamso --> |
||||
<!ENTITY nabla "∇" ><!-- nabla = backward difference, U+2207 ISOtech --> |
||||
<!ENTITY isin "∈" ><!-- element of, U+2208 ISOtech --> |
||||
<!ENTITY notin "∉" ><!-- not an element of, U+2209 ISOtech --> |
||||
<!ENTITY ni "∋" ><!-- contains as member, U+220B ISOtech --> |
||||
<!-- should there be a more memorable name than 'ni'? --> |
||||
<!ENTITY prod "∏" ><!-- n-ary product = product sign, U+220F ISOamsb --> |
||||
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though |
||||
the same glyph might be used for both --> |
||||
<!ENTITY sum "∑" ><!-- n-ary sumation, U+2211 ISOamsb --> |
||||
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma' |
||||
though the same glyph might be used for both --> |
||||
<!ENTITY minus "−" ><!-- minus sign, U+2212 ISOtech --> |
||||
<!ENTITY lowast "∗" ><!-- asterisk operator, U+2217 ISOtech --> |
||||
<!ENTITY radic "√" ><!-- square root = radical sign, U+221A ISOtech --> |
||||
<!ENTITY prop "∝" ><!-- proportional to, U+221D ISOtech --> |
||||
<!ENTITY infin "∞" ><!-- infinity, U+221E ISOtech --> |
||||
<!ENTITY ang "∠" ><!-- angle, U+2220 ISOamso --> |
||||
<!ENTITY and "∧" ><!-- logical and = wedge, U+2227 ISOtech --> |
||||
<!ENTITY or "∨" ><!-- logical or = vee, U+2228 ISOtech --> |
||||
<!ENTITY cap "∩" ><!-- intersection = cap, U+2229 ISOtech --> |
||||
<!ENTITY cup "∪" ><!-- union = cup, U+222A ISOtech --> |
||||
<!ENTITY int "∫" ><!-- integral, U+222B ISOtech --> |
||||
<!ENTITY there4 "∴" ><!-- therefore, U+2234 ISOtech --> |
||||
<!ENTITY sim "∼" ><!-- tilde operator = varies with = similar to, U+223C ISOtech --> |
||||
<!-- tilde operator is NOT the same character as the tilde, U+007E, |
||||
although the same glyph might be used to represent both --> |
||||
<!ENTITY cong "≅" ><!-- approximately equal to, U+2245 ISOtech --> |
||||
<!ENTITY asymp "≈" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr --> |
||||
<!ENTITY ne "≠" ><!-- not equal to, U+2260 ISOtech --> |
||||
<!ENTITY equiv "≡" ><!-- identical to, U+2261 ISOtech --> |
||||
<!ENTITY le "≤" ><!-- less-than or equal to, U+2264 ISOtech --> |
||||
<!ENTITY ge "≥" ><!-- greater-than or equal to, U+2265 ISOtech --> |
||||
<!ENTITY sub "⊂" ><!-- subset of, U+2282 ISOtech --> |
||||
<!ENTITY sup "⊃" ><!-- superset of, U+2283 ISOtech --> |
||||
<!-- note that nsup, 'not a superset of, U+2285' is not covered by the Symbol |
||||
font encoding and is not included. Should it be, for symmetry? |
||||
It is in ISOamsn --> |
||||
<!ENTITY nsub "⊄" ><!-- not a subset of, U+2284 ISOamsn --> |
||||
<!ENTITY sube "⊆" ><!-- subset of or equal to, U+2286 ISOtech --> |
||||
<!ENTITY supe "⊇" ><!-- superset of or equal to, U+2287 ISOtech --> |
||||
<!ENTITY oplus "⊕" ><!-- circled plus = direct sum, U+2295 ISOamsb --> |
||||
<!ENTITY otimes "⊗" ><!-- circled times = vector product, U+2297 ISOamsb --> |
||||
<!ENTITY perp "⊥" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech --> |
||||
<!ENTITY sdot "⋅" ><!-- dot operator, U+22C5 ISOamsb --> |
||||
<!-- dot operator is NOT the same character as U+00B7 middle dot --> |
||||
|
||||
<!-- Miscellaneous Technical --> |
||||
<!ENTITY lceil "⌈" ><!-- left ceiling = apl upstile, U+2308 ISOamsc --> |
||||
<!ENTITY rceil "⌉" ><!-- right ceiling, U+2309 ISOamsc --> |
||||
<!ENTITY lfloor "⌊" ><!-- left floor = apl downstile, U+230A ISOamsc --> |
||||
<!ENTITY rfloor "⌋" ><!-- right floor, U+230B ISOamsc --> |
||||
<!ENTITY lang "〈" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech --> |
||||
<!-- lang is NOT the same character as U+003C 'less than' |
||||
or U+2039 'single left-pointing angle quotation mark' --> |
||||
<!ENTITY rang "〉" ><!-- right-pointing angle bracket = ket, U+232A ISOtech --> |
||||
<!-- rang is NOT the same character as U+003E 'greater than' |
||||
or U+203A 'single right-pointing angle quotation mark' --> |
||||
|
||||
<!-- Geometric Shapes --> |
||||
<!ENTITY loz "◊" ><!-- lozenge, U+25CA ISOpub --> |
||||
|
||||
<!-- Miscellaneous Symbols --> |
||||
<!ENTITY spades "♠" ><!-- black spade suit, U+2660 ISOpub --> |
||||
<!-- black here seems to mean filled as opposed to hollow --> |
||||
<!ENTITY clubs "♣" ><!-- black club suit = shamrock, U+2663 ISOpub --> |
||||
<!ENTITY hearts "♥" ><!-- black heart suit = valentine, U+2665 ISOpub --> |
||||
<!ENTITY diams "♦" ><!-- black diamond suit, U+2666 ISOpub --> |
||||
|
||||
<!-- end of xhtml-symbol.ent --> |
||||
""" |
||||
return text |
||||
|
||||
def get_apache_license(): |
||||
license = r"""/** |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
""" |
||||
return license |
||||
|
||||
main() |
@ -0,0 +1,61 @@
|
||||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
||||
<!-- |
||||
Licensed to the Apache Software Foundation (ASF) under one or more |
||||
contributor license agreements. See the NOTICE file distributed with |
||||
this work for additional information regarding copyright ownership. |
||||
The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
(the "License"); you may not use this file except in compliance with |
||||
the License. You may obtain a copy of the License at |
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
||||
Unless required by applicable law or agreed to in writing, software |
||||
distributed under the License is distributed on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
See the License for the specific language governing permissions and |
||||
limitations under the License. |
||||
--> |
||||
<html><head></head> |
||||
<body> |
||||
<p> |
||||
Normalization of text before the tokenizer. |
||||
</p> |
||||
<p> |
||||
CharFilters are chainable filters that normalize text before tokenization |
||||
and provide mappings between normalized text offsets and the corresponding |
||||
offsets in the original text. |
||||
</p> |
||||
<H2>CharFilter offset mappings</H2> |
||||
<p> |
||||
CharFilters modify an input stream via a series of substring |
||||
replacements (including deletions and insertions) to produce an output |
||||
stream. There are three possible replacement cases: the replacement |
||||
string has the same length as the original substring; the replacement |
||||
is shorter; and the replacement is longer. In the latter two cases |
||||
(when the replacement has a different length than the original), |
||||
one or more offset correction mappings are required. |
||||
</p> |
||||
<p> |
||||
When the replacement is shorter than the original (e.g. when the |
||||
replacement is the empty string), a single offset correction mapping |
||||
should be added at the replacement's end offset in the output stream. |
||||
The <code>cumulativeDiff</code> parameter to the |
||||
<code>addOffCorrectMapping()</code> method will be the sum of all |
||||
previous replacement offset adjustments, with the addition of the |
||||
difference between the lengths of the original substring and the |
||||
replacement string (a positive value). |
||||
</p> |
||||
<p> |
||||
When the replacement is longer than the original (e.g. when the |
||||
original is the empty string), you should add as many offset |
||||
correction mappings as the difference between the lengths of the |
||||
replacement string and the original substring, starting at the |
||||
end offset the original substring would have had in the output stream. |
||||
The <code>cumulativeDiff</code> parameter to the |
||||
<code>addOffCorrectMapping()</code> method will be the sum of all |
||||
previous replacement offset adjustments, with the addition of the |
||||
difference between the lengths of the original substring and the |
||||
replacement string so far (a negative value). |
||||
</p> |
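<p> |
A minimal sketch of the shorter-replacement case (hedged: a toy filter, not part |
of this package, that deletes soft hyphens (U+00AD) and records one offset |
correction per deletion via BaseCharFilter's addOffCorrectMapping()): |
</p> |
<pre class="prettyprint"> |
public final class StripSoftHyphenCharFilter extends BaseCharFilter { |
  private int readFromInput = 0;   // offset in the original text |
  private int written = 0;         // offset in the output stream |
 |
  public StripSoftHyphenCharFilter(Reader in) { super(in); } |
 |
  @Override |
  public int read(char[] cbuf, int off, int len) throws IOException { |
    int n = 0; |
    while (n < len) { |
      int c = input.read(); |
      if (c == -1) break; |
      readFromInput++; |
      if (c == '\u00AD') { |
        // empty replacement: record the cumulative difference at the |
        // replacement's end offset in the output stream (a positive value) |
        addOffCorrectMapping(written, readFromInput - written); |
        continue; |
      } |
      cbuf[off + n++] = (char) c; |
      written++; |
    } |
    return n == 0 ? -1 : n; |
  } |
} |
</pre> |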
||||
</body> |
||||
</html> |
@ -0,0 +1,104 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.Analyzer; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.Tokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.core.StopFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet; |
||||
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase; |
||||
import com.fr.third.org.apache.lucene.util.Version; |
||||
|
||||
/** |
||||
* An {@link Analyzer} that tokenizes text with {@link StandardTokenizer}, |
||||
* normalizes content with {@link CJKWidthFilter}, folds case with |
||||
* {@link LowerCaseFilter}, forms bigrams of CJK with {@link CJKBigramFilter}, |
||||
* and filters stopwords with {@link StopFilter}. |
||||
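* <p> |
* A minimal sketch of supplying a custom stopword set (hypothetical stopwords; |
* the two-argument constructor is declared below): |
* <pre class="prettyprint"> |
*   CharArraySet stopwords = new CharArraySet(Version.LUCENE_36, |
*       Arrays.asList("の", "に"), true); |
*   Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_36, stopwords); |
* </pre> |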
*/ |
||||
public final class CJKAnalyzer extends StopwordAnalyzerBase { |
||||
/** |
||||
* File containing default CJK stopwords. |
||||
* <p/> |
||||
* Currently it contains some common English words that are not usually |
||||
* useful for searching and some double-byte punctuation characters. |
||||
*/ |
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; |
||||
|
||||
/** |
||||
* Returns an unmodifiable instance of the default stop-words set. |
||||
* @return an unmodifiable instance of the default stop-words set. |
||||
*/ |
||||
public static CharArraySet getDefaultStopSet(){ |
||||
return DefaultSetHolder.DEFAULT_STOP_SET; |
||||
} |
||||
|
||||
private static class DefaultSetHolder { |
||||
static final CharArraySet DEFAULT_STOP_SET; |
||||
|
||||
static { |
||||
try { |
||||
DEFAULT_STOP_SET = loadStopwordSet(false, CJKAnalyzer.class, DEFAULT_STOPWORD_FILE, "#"); |
||||
} catch (IOException ex) { |
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set", ex); |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer which removes words in {@link #getDefaultStopSet()}. |
||||
*/ |
||||
public CJKAnalyzer(Version matchVersion) { |
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer with the given stop words |
||||
* |
||||
* @param matchVersion |
||||
* lucene compatibility version |
||||
* @param stopwords |
||||
* a stopword set |
||||
*/ |
||||
public CJKAnalyzer(Version matchVersion, CharArraySet stopwords){ |
||||
super(matchVersion, stopwords); |
||||
} |
||||
|
||||
@Override |
||||
protected TokenStreamComponents createComponents(String fieldName, |
||||
Reader reader) { |
||||
if (matchVersion.onOrAfter(Version.LUCENE_36)) { |
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
||||
// run CJKWidthFilter before bigramming, since it sometimes combines two characters into one.
|
||||
TokenStream result = new CJKWidthFilter(source); |
||||
result = new LowerCaseFilter(matchVersion, result); |
||||
result = new CJKBigramFilter(result); |
||||
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords)); |
||||
} else { |
||||
final Tokenizer source = new CJKTokenizer(reader); |
||||
return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords)); |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,363 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
||||
import com.fr.third.org.apache.lucene.util.ArrayUtil; |
||||
|
||||
/** |
||||
* Forms bigrams of CJK terms that are generated from StandardTokenizer |
||||
* or ICUTokenizer. |
||||
* <p> |
||||
* CJK types are set by these tokenizers, but you can also use |
||||
* {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which |
||||
* of the CJK scripts are turned into bigrams. |
||||
* <p> |
||||
* By default, when a CJK character has no adjacent characters to form |
||||
* a bigram, it is output in unigram form. If you want to always output |
||||
* both unigrams and bigrams, set the <code>outputUnigrams</code> |
||||
* flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}. |
||||
* This can be used for a combined unigram+bigram approach. |
||||
* <p> |
||||
* In all cases, all non-CJK input is passed through unmodified. |
||||
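* <p> |
* A minimal configuration sketch (hypothetical input stream; the flags are the |
* constants declared on this class): |
* <pre class="prettyprint"> |
*   TokenStream ts = new StandardTokenizer(Version.LUCENE_36, reader); |
*   // bigram only Han and Hangul; Hiragana and Katakana pass through unchanged |
*   ts = new CJKBigramFilter(ts, CJKBigramFilter.HAN | CJKBigramFilter.HANGUL); |
* </pre> |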
*/ |
||||
public final class CJKBigramFilter extends TokenFilter { |
||||
// configuration
|
||||
/** bigram flag for Han Ideographs */ |
||||
public static final int HAN = 1; |
||||
/** bigram flag for Hiragana */ |
||||
public static final int HIRAGANA = 2; |
||||
/** bigram flag for Katakana */ |
||||
public static final int KATAKANA = 4; |
||||
/** bigram flag for Hangul */ |
||||
public static final int HANGUL = 8; |
||||
|
||||
/** when we emit a bigram, it is then marked as this type */ |
||||
public static final String DOUBLE_TYPE = "<DOUBLE>"; |
||||
/** when we emit a unigram, it is then marked as this type */ |
||||
public static final String SINGLE_TYPE = "<SINGLE>"; |
||||
|
||||
// the types from standardtokenizer
|
||||
private static final String HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]; |
||||
private static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA]; |
||||
private static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA]; |
||||
private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL]; |
||||
|
||||
// sentinel value for ignoring a script
|
||||
private static final Object NO = new Object(); |
||||
|
||||
// these are set to either their type or NO if we want to pass them thru
|
||||
private final Object doHan; |
||||
private final Object doHiragana; |
||||
private final Object doKatakana; |
||||
private final Object doHangul; |
||||
|
||||
// true if we should output unigram tokens always
|
||||
private final boolean outputUnigrams; |
||||
private boolean ngramState; // false = output unigram, true = output bigram
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); |
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
||||
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); |
||||
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class); |
||||
|
||||
// buffers containing codepoint and offsets in parallel
|
||||
int buffer[] = new int[8]; |
||||
int startOffset[] = new int[8]; |
||||
int endOffset[] = new int[8]; |
||||
// length of valid buffer
|
||||
int bufferLen; |
||||
// current buffer index
|
||||
int index; |
||||
|
||||
// the last end offset, to determine if we should bigram across tokens
|
||||
int lastEndOffset; |
||||
|
||||
private boolean exhausted; |
||||
|
||||
/** |
||||
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int) |
||||
* CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)} |
||||
*/ |
||||
public CJKBigramFilter(TokenStream in) { |
||||
this(in, HAN | HIRAGANA | KATAKANA | HANGUL); |
||||
} |
||||
|
||||
/** |
||||
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean) |
||||
* CJKBigramFilter(in, flags, false)} |
||||
*/ |
||||
public CJKBigramFilter(TokenStream in, int flags) { |
||||
this(in, flags, false); |
||||
} |
||||
|
||||
/** |
||||
* Create a new CJKBigramFilter, specifying which writing systems should be bigrammed, |
||||
* and whether or not unigrams should also be output. |
||||
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA}, |
||||
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL} |
||||
* @param outputUnigrams true if unigrams for the selected writing systems should also be output. |
||||
* When this is false, this is only done when there are no adjacent characters to form |
||||
* a bigram. |
||||
*/ |
||||
public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) { |
||||
super(in); |
||||
doHan = (flags & HAN) == 0 ? NO : HAN_TYPE; |
||||
doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE; |
||||
doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE; |
||||
doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE; |
||||
this.outputUnigrams = outputUnigrams; |
||||
} |
||||
|
||||
/* |
||||
* much of this complexity revolves around handling the special case of a |
||||
* "lone cjk character" where cjktokenizer would output a unigram. this |
||||
* is also the only time we ever have to captureState. |
||||
*/ |
||||
@Override |
||||
public boolean incrementToken() throws IOException { |
||||
while (true) { |
||||
if (hasBufferedBigram()) { |
||||
|
||||
// case 1: we have multiple remaining codepoints buffered,
|
||||
// so we can emit a bigram here.
|
||||
|
||||
if (outputUnigrams) { |
||||
|
||||
// when also outputting unigrams, we output the unigram first,
|
||||
// then rewind back to revisit the bigram.
|
||||
// so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
|
||||
// the logic in hasBufferedUnigram ensures we output the C,
|
||||
// even though it did actually have adjacent CJK characters.
|
||||
|
||||
if (ngramState) { |
||||
flushBigram(); |
||||
} else { |
||||
flushUnigram(); |
||||
index--; |
||||
} |
||||
ngramState = !ngramState; |
||||
} else { |
||||
flushBigram(); |
||||
} |
||||
return true; |
||||
} else if (doNext()) { |
||||
|
||||
// case 2: look at the token type. should we form any n-grams?
|
||||
|
||||
String type = typeAtt.type(); |
||||
if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul) { |
||||
|
||||
// acceptable CJK type: we form n-grams from these.
|
||||
// as long as the offsets are aligned, we just add these to our current buffer.
|
||||
// otherwise, we clear the buffer and start over.
|
||||
|
||||
if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue
|
||||
if (hasBufferedUnigram()) { |
||||
|
||||
// we have a buffered unigram, and we peeked ahead to see if we could form
|
||||
// a bigram, but we can't, because the offsets are unaligned. capture the state
|
||||
// of this peeked data to be revisited next time through the loop, and dump our unigram.
|
||||
|
||||
loneState = captureState(); |
||||
flushUnigram(); |
||||
return true; |
||||
} |
||||
index = 0; |
||||
bufferLen = 0; |
||||
} |
||||
refill(); |
||||
} else { |
||||
|
||||
// not a CJK type: we just return these as-is.
|
||||
|
||||
if (hasBufferedUnigram()) { |
||||
|
||||
// we have a buffered unigram, and we peeked ahead to see if we could form
|
||||
// a bigram, but we can't, because it's not a CJK type. capture the state
|
||||
// of this peeked data to be revisited next time through the loop, and dump our unigram.
|
||||
|
||||
loneState = captureState(); |
||||
flushUnigram(); |
||||
return true; |
||||
} |
||||
return true; |
||||
} |
||||
} else { |
||||
|
||||
// case 3: we have only zero or 1 codepoints buffered,
|
||||
// so not enough to form a bigram. But, we also have no
|
||||
// more input. So if we have a buffered codepoint, emit
|
||||
// a unigram; otherwise, it's end of stream.
|
||||
|
||||
if (hasBufferedUnigram()) { |
||||
flushUnigram(); // flush our remaining unigram
|
||||
return true; |
||||
} |
||||
return false; |
||||
} |
||||
} |
||||
} |
||||
|
||||
private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams
|
||||
|
||||
/** |
||||
* looks at the next input token, returning false if none is available |
||||
*/ |
||||
private boolean doNext() throws IOException { |
||||
if (loneState != null) { |
||||
restoreState(loneState); |
||||
loneState = null; |
||||
return true; |
||||
} else { |
||||
if (exhausted) { |
||||
return false; |
||||
} else if (input.incrementToken()) { |
||||
return true; |
||||
} else { |
||||
exhausted = true; |
||||
return false; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* refills buffers with new data from the current token. |
||||
*/ |
||||
private void refill() { |
||||
// compact buffers to keep them smallish if they become large
|
||||
// just a safety check, but technically we only need the last codepoint
|
||||
if (bufferLen > 64) { |
||||
int last = bufferLen - 1; |
||||
buffer[0] = buffer[last]; |
||||
startOffset[0] = startOffset[last]; |
||||
endOffset[0] = endOffset[last]; |
||||
bufferLen = 1; |
||||
index -= last; |
||||
} |
||||
|
||||
char termBuffer[] = termAtt.buffer(); |
||||
int len = termAtt.length(); |
||||
int start = offsetAtt.startOffset(); |
||||
int end = offsetAtt.endOffset(); |
||||
|
||||
int newSize = bufferLen + len; |
||||
buffer = ArrayUtil.grow(buffer, newSize); |
||||
startOffset = ArrayUtil.grow(startOffset, newSize); |
||||
endOffset = ArrayUtil.grow(endOffset, newSize); |
||||
lastEndOffset = end; |
||||
|
||||
if (end - start != len) { |
||||
// offsets out of sync with the term length (modified by a synonym filter or charfilter): just preserve
|
||||
for (int i = 0, cp = 0; i < len; i += Character.charCount(cp)) { |
||||
cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len); |
||||
startOffset[bufferLen] = start; |
||||
endOffset[bufferLen] = end; |
||||
bufferLen++; |
||||
} |
||||
} else { |
||||
// normal offsets
|
||||
for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen) { |
||||
cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len); |
||||
cpLen = Character.charCount(cp); |
||||
startOffset[bufferLen] = start; |
||||
start = endOffset[bufferLen] = start + cpLen; |
||||
bufferLen++; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Flushes a bigram token to output from our buffer |
||||
* This is the normal case, e.g. ABC -> AB BC |
||||
*/ |
||||
private void flushBigram() { |
||||
clearAttributes(); |
||||
char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
|
||||
int len1 = Character.toChars(buffer[index], termBuffer, 0); |
||||
int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1); |
||||
termAtt.setLength(len2); |
||||
offsetAtt.setOffset(startOffset[index], endOffset[index+1]); |
||||
typeAtt.setType(DOUBLE_TYPE); |
||||
// when outputting unigrams, all bigrams are synonyms that span two unigrams
|
||||
if (outputUnigrams) { |
||||
posIncAtt.setPositionIncrement(0); |
||||
posLengthAtt.setPositionLength(2); |
||||
} |
||||
index++; |
||||
} |
||||
|
||||
/** |
||||
* Flushes a unigram token to output from our buffer. |
||||
* This happens when we encounter isolated CJK characters, either the whole |
||||
* CJK string is a single character, or we encounter a CJK character surrounded |
||||
* by space, punctuation, english, etc, but not beside any other CJK. |
||||
*/ |
||||
private void flushUnigram() { |
||||
clearAttributes(); |
||||
char termBuffer[] = termAtt.resizeBuffer(2); // maximum unigram length in code units (one surrogate pair)
|
||||
int len = Character.toChars(buffer[index], termBuffer, 0); |
||||
termAtt.setLength(len); |
||||
offsetAtt.setOffset(startOffset[index], endOffset[index]); |
||||
typeAtt.setType(SINGLE_TYPE); |
||||
index++; |
||||
} |
||||
|
||||
/** |
||||
* True if we have multiple codepoints sitting in our buffer |
||||
*/ |
||||
private boolean hasBufferedBigram() { |
||||
return bufferLen - index > 1; |
||||
} |
||||
|
||||
/** |
||||
* True if we have a single codepoint sitting in our buffer, where its future |
||||
* (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen |
||||
* inputs. |
||||
*/ |
||||
private boolean hasBufferedUnigram() { |
||||
if (outputUnigrams) { |
||||
// when always outputting unigrams
|
||||
return bufferLen - index == 1; |
||||
} else { |
||||
// otherwise it's only when we have a lone CJK character
|
||||
return bufferLen == 1 && index == 0; |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public void reset() throws IOException { |
||||
super.reset(); |
||||
bufferLen = 0; |
||||
index = 0; |
||||
lastEndOffset = 0; |
||||
loneState = null; |
||||
exhausted = false; |
||||
ngramState = false; |
||||
} |
||||
} |
@ -0,0 +1,67 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.util.Map; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.cjk.CJKBigramFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory; |
||||
|
||||
/** |
||||
* Factory for {@link CJKBigramFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_cjk" class="solr.TextField"> |
||||
* <analyzer> |
||||
* <tokenizer class="solr.StandardTokenizerFactory"/> |
||||
* <filter class="solr.CJKWidthFilterFactory"/> |
||||
* <filter class="solr.LowerCaseFilterFactory"/> |
||||
* <filter class="solr.CJKBigramFilterFactory" |
||||
* han="true" hiragana="true" |
||||
* katakana="true" hangul="true" outputUnigrams="false" /> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
*/ |
||||
public class CJKBigramFilterFactory extends TokenFilterFactory { |
||||
int flags; |
||||
boolean outputUnigrams; |
||||
|
||||
@Override |
||||
public void init(Map<String,String> args) { |
||||
super.init(args); |
||||
flags = 0; |
||||
if (getBoolean("han", true)) { |
||||
flags |= CJKBigramFilter.HAN; |
||||
} |
||||
if (getBoolean("hiragana", true)) { |
||||
flags |= CJKBigramFilter.HIRAGANA; |
||||
} |
||||
if (getBoolean("katakana", true)) { |
||||
flags |= CJKBigramFilter.KATAKANA; |
||||
} |
||||
if (getBoolean("hangul", true)) { |
||||
flags |= CJKBigramFilter.HANGUL; |
||||
} |
||||
outputUnigrams = getBoolean("outputUnigrams", false); |
||||
} |
||||
|
||||
@Override |
||||
public TokenStream create(TokenStream input) { |
||||
return new CJKBigramFilter(input, flags, outputUnigrams); |
||||
} |
||||
} |
@ -0,0 +1,311 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.Tokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
||||
import com.fr.third.org.apache.lucene.util.AttributeSource; |
||||
|
||||
/** |
||||
* CJKTokenizer is designed for Chinese, Japanese, and Korean languages. |
||||
* <p> |
||||
* The tokens returned are every two adjacent characters with overlap match. |
||||
* </p> |
||||
* <p> |
||||
* Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4". |
||||
* </p> |
||||
* Additionally, the following is applied to Latin text (such as English): |
||||
* <ul> |
||||
* <li>Text is converted to lowercase. |
||||
* <li>Numeric digits, '+', '#', and '_' are tokenized as letters. |
||||
* <li>Full-width forms are converted to half-width forms. |
||||
* </ul> |
||||
* For more info on Asian language (Chinese, Japanese, and Korean) text segmentation, |
||||
* please search <a |
||||
* href="http://www.google.com/search?q=word+chinese+segment">google</a> |
||||
* |
||||
* @deprecated Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead. |
||||
*/ |
||||
@Deprecated |
||||
public final class CJKTokenizer extends Tokenizer { |
||||
//~ Static fields/initializers ---------------------------------------------
|
||||
/** Word token type */ |
||||
static final int WORD_TYPE = 0; |
||||
|
||||
/** Single byte token type */ |
||||
static final int SINGLE_TOKEN_TYPE = 1; |
||||
|
||||
/** Double byte token type */ |
||||
static final int DOUBLE_TOKEN_TYPE = 2; |
||||
|
||||
/** Names for token types */ |
||||
static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" }; |
||||
|
||||
/** Max word length */ |
||||
private static final int MAX_WORD_LEN = 255; |
||||
|
||||
/** buffer size: */ |
||||
private static final int IO_BUFFER_SIZE = 256; |
||||
|
||||
//~ Instance fields --------------------------------------------------------
|
||||
|
||||
/** word offset, used to indicate which character in the input is being parsed */ |
||||
private int offset = 0; |
||||
|
||||
/** the index used only for ioBuffer */ |
||||
private int bufferIndex = 0; |
||||
|
||||
/** data length */ |
||||
private int dataLen = 0; |
||||
|
||||
/** |
||||
* character buffer, stores the characters which are used to compose <br> |
||||
* the returned Token |
||||
*/ |
||||
private final char[] buffer = new char[MAX_WORD_LEN]; |
||||
|
||||
/** |
||||
* I/O buffer, used to store the content of the input (one of the <br> |
||||
* members of Tokenizer) |
||||
*/ |
||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; |
||||
|
||||
/** word type: single=>ASCII double=>non-ASCII word=>default */ |
||||
private int tokenType = WORD_TYPE; |
||||
|
||||
/** |
||||
* flag: the previous character was a cached double-byte character. "C1C2C3C4" |
||||
* ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened) |
||||
* C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4" |
||||
*/ |
||||
private boolean preIsTokened = false; |
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); |
||||
|
||||
//~ Constructors -----------------------------------------------------------
|
||||
|
||||
/** |
||||
* Construct a token stream processing the given input. |
||||
* |
||||
* @param in I/O reader |
||||
*/ |
||||
public CJKTokenizer(Reader in) { |
||||
super(in); |
||||
} |
||||
|
||||
public CJKTokenizer(AttributeSource source, Reader in) { |
||||
super(source, in); |
||||
} |
||||
|
||||
public CJKTokenizer(AttributeFactory factory, Reader in) { |
||||
super(factory, in); |
||||
} |
||||
|
||||
//~ Methods ----------------------------------------------------------------
|
||||
|
||||
/** |
||||
* Returns true for the next token in the stream, or false at EOS. |
||||
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
|
||||
* for detail. |
||||
* |
||||
* @return false for end of stream, true otherwise |
||||
* |
||||
* @throws IOException - thrown when a read error <br> |
||||
* occurs in the InputStream |
||||
* |
||||
*/ |
||||
@Override |
||||
public boolean incrementToken() throws IOException { |
||||
clearAttributes(); |
||||
/** how many characters have been stored in the buffer */ |
||||
|
||||
while(true) { // loop until we find a non-empty token
|
||||
|
||||
int length = 0; |
||||
|
||||
/** the position used to create Token */ |
||||
int start = offset; |
||||
|
||||
while (true) { // loop until we've found a full token
|
||||
/** current character */ |
||||
char c; |
||||
|
||||
/** unicode block of current character for detail */ |
||||
Character.UnicodeBlock ub; |
||||
|
||||
offset++; |
||||
|
||||
if (bufferIndex >= dataLen) { |
||||
dataLen = input.read(ioBuffer); |
||||
bufferIndex = 0; |
||||
} |
||||
|
||||
if (dataLen == -1) { |
||||
if (length > 0) { |
||||
if (preIsTokened) { |
||||
length = 0; |
||||
preIsTokened = false; |
||||
} |
||||
else { |
||||
offset--; |
||||
} |
||||
|
||||
break; |
||||
} else { |
||||
offset--; |
||||
return false; |
||||
} |
||||
} else { |
||||
//get current character
|
||||
c = ioBuffer[bufferIndex++]; |
||||
|
||||
//get the UnicodeBlock of the current character
|
||||
ub = Character.UnicodeBlock.of(c); |
||||
} |
||||
|
||||
//if the current character is ASCII or Extend ASCII
|
||||
if ((ub == Character.UnicodeBlock.BASIC_LATIN) |
||||
|| (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) |
||||
) { |
||||
if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) { |
||||
int i = (int) c; |
||||
if (i >= 65281 && i <= 65374) { |
||||
// convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
|
||||
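// 65281..65374 is the fullwidth range U+FF01..U+FF5E; subtracting |
// 65248 (0xFEE0) yields the corresponding ASCII character U+0021..U+007E |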
i = i - 65248; |
||||
c = (char) i; |
||||
} |
||||
} |
||||
|
||||
// if the current character is a letter or "_" "+" "#"
|
||||
if (Character.isLetterOrDigit(c) |
||||
|| ((c == '_') || (c == '+') || (c == '#')) |
||||
) { |
||||
if (length == 0) { |
||||
// "javaC1C2C3C4linux" <br>
|
||||
// ^--: the current character begins an ASCII-letter
|
||||
// token
|
||||
start = offset - 1; |
||||
} else if (tokenType == DOUBLE_TOKEN_TYPE) { |
||||
// "javaC1C2C3C4linux" <br>
|
||||
// ^--: the previous non-ASCII
|
||||
// : the current character
|
||||
offset--; |
||||
bufferIndex--; |
||||
|
||||
if (preIsTokened) { |
||||
// only one non-ASCII character has been stored
|
||||
length = 0; |
||||
preIsTokened = false; |
||||
break; |
||||
} else { |
||||
break; |
||||
} |
||||
} |
||||
|
||||
// store the LowerCase(c) in the buffer
|
||||
buffer[length++] = Character.toLowerCase(c); |
||||
tokenType = SINGLE_TOKEN_TYPE; |
||||
|
||||
// break the procedure if buffer overflowed!
|
||||
if (length == MAX_WORD_LEN) { |
||||
break; |
||||
} |
||||
} else if (length > 0) { |
||||
if (preIsTokened) { |
||||
length = 0; |
||||
preIsTokened = false; |
||||
} else { |
||||
break; |
||||
} |
||||
} |
||||
} else { |
||||
// non-ASCII letter, e.g."C1C2C3C4"
|
||||
if (Character.isLetter(c)) { |
||||
if (length == 0) { |
||||
start = offset - 1; |
||||
buffer[length++] = c; |
||||
tokenType = DOUBLE_TOKEN_TYPE; |
||||
} else { |
||||
if (tokenType == SINGLE_TOKEN_TYPE) { |
||||
offset--; |
||||
bufferIndex--; |
||||
|
||||
//return the previous ASCII characters
|
||||
break; |
||||
} else { |
||||
buffer[length++] = c; |
||||
tokenType = DOUBLE_TOKEN_TYPE; |
||||
|
||||
if (length == 2) { |
||||
offset--; |
||||
bufferIndex--; |
||||
preIsTokened = true; |
||||
|
||||
break; |
||||
} |
||||
} |
||||
} |
||||
} else if (length > 0) { |
||||
if (preIsTokened) { |
||||
// empty the buffer
|
||||
length = 0; |
||||
preIsTokened = false; |
||||
} else { |
||||
break; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
if (length > 0) { |
||||
termAtt.copyBuffer(buffer, 0, length); |
||||
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length)); |
||||
typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]); |
||||
return true; |
||||
} else if (dataLen == -1) { |
||||
offset--; |
||||
return false; |
||||
} |
||||
|
||||
// Cycle back and try for the next token (don't
|
||||
// return an empty string)
|
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public final void end() { |
||||
// set final offset
|
||||
final int finalOffset = correctOffset(offset); |
||||
this.offsetAtt.setOffset(finalOffset, finalOffset); |
||||
} |
||||
|
||||
@Override |
||||
public void reset() throws IOException { |
||||
super.reset(); |
||||
offset = bufferIndex = dataLen = 0; |
||||
preIsTokened = false; |
||||
tokenType = WORD_TYPE; |
||||
} |
||||
} |
@ -0,0 +1,41 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.cjk.CJKTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenizerFactory; |
||||
|
||||
import java.io.Reader; |
||||
|
||||
/** |
||||
* Factory for {@link CJKTokenizer}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100"> |
||||
* <analyzer> |
||||
* <tokenizer class="solr.CJKTokenizerFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
* @deprecated Use {@link CJKBigramFilterFactory} instead. |
||||
*/ |
||||
@Deprecated |
||||
public class CJKTokenizerFactory extends TokenizerFactory { |
||||
public CJKTokenizer create(Reader in) { |
||||
return new CJKTokenizer(in); |
||||
} |
||||
} |
||||
|
@ -0,0 +1,112 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.util.StemmerUtil; |
||||
|
||||
/** |
||||
* A {@link TokenFilter} that normalizes CJK width differences: |
||||
* <ul> |
||||
* <li>Folds fullwidth ASCII variants into the equivalent basic latin |
||||
* <li>Folds halfwidth Katakana variants into the equivalent kana |
||||
* </ul> |
||||
* <p> |
||||
* NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD |
||||
* Unicode normalization. See the normalization support in the ICU package
|
||||
* for full normalization. |
||||
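* <p> |
* A minimal usage sketch (hypothetical input stream; fullwidth "Ｌｕｃｅｎｅ" |
* folds to "Lucene", and halfwidth "ｶ" followed by "ﾞ" combines to "ガ"): |
* <pre class="prettyprint"> |
*   TokenStream ts = new StandardTokenizer(Version.LUCENE_36, reader); |
*   ts = new CJKWidthFilter(ts); |
* </pre> |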
*/ |
||||
public final class CJKWidthFilter extends TokenFilter { |
||||
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
|
||||
/* halfwidth kana mappings: 0xFF65-0xFF9F |
||||
* |
||||
* note: 0xFF9E and 0xFF9F are only mapped to 0x3099 and 0x309A |
||||
* as a fallback when they cannot properly combine with a preceding |
||||
* character into a composed form. |
||||
*/ |
||||
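// e.g. halfwidth "ｶ" (U+FF76) maps to "カ" (U+30AB); a following voiced mark |
// "ﾞ" (U+FF9E) is then merged into "ガ" (U+30AC) by combine() below |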
private static final char KANA_NORM[] = new char[] { |
||||
0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5, |
||||
0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab, |
||||
0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd, |
||||
0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd, |
||||
0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0, |
||||
0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec, |
||||
0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A |
||||
}; |
||||
|
||||
public CJKWidthFilter(TokenStream input) { |
||||
super(input); |
||||
} |
||||
|
||||
@Override |
public boolean incrementToken() throws IOException { |
||||
if (input.incrementToken()) { |
||||
char text[] = termAtt.buffer(); |
||||
int length = termAtt.length(); |
||||
for (int i = 0; i < length; i++) { |
||||
final char ch = text[i]; |
||||
if (ch >= 0xFF01 && ch <= 0xFF5E) { |
||||
// Fullwidth ASCII variants
|
||||
text[i] -= 0xFEE0; |
||||
} else if (ch >= 0xFF65 && ch <= 0xFF9F) { |
||||
// Halfwidth Katakana variants
|
||||
if ((ch == 0xFF9E || ch == 0xFF9F) && i > 0 && combine(text, i, ch)) { |
||||
length = StemmerUtil.delete(text, i--, length); |
||||
} else { |
||||
text[i] = KANA_NORM[ch - 0xFF65]; |
||||
} |
||||
} |
||||
} |
||||
termAtt.setLength(length); |
||||
return true; |
||||
} else { |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
/* kana combining diffs: 0x30A6-0x30FD */ |
||||
private static final byte KANA_COMBINE_VOICED[] = new byte[] { |
||||
78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, |
||||
0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, |
||||
0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 |
||||
}; |
||||
|
||||
private static final byte KANA_COMBINE_HALF_VOICED[] = new byte[] { |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, |
||||
0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
||||
}; |
||||
|
||||
/** returns true if we successfully combined the voice mark */ |
||||
private static boolean combine(char text[], int pos, char ch) { |
||||
final char prev = text[pos-1]; |
||||
if (prev >= 0x30A6 && prev <= 0x30FD) { |
||||
text[pos-1] += (ch == 0xFF9F) |
||||
? KANA_COMBINE_HALF_VOICED[prev - 0x30A6] |
||||
: KANA_COMBINE_VOICED[prev - 0x30A6]; |
||||
return text[pos-1] != prev; |
||||
} |
||||
return false; |
||||
} |
||||
} |
@ -0,0 +1,50 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.cjk.CJKWidthFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.util.AbstractAnalysisFactory; |
||||
import com.fr.third.org.apache.lucene.analysis.util.MultiTermAwareComponent; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory; |
||||
|
||||
/** |
||||
* Factory for {@link CJKWidthFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_cjk" class="solr.TextField"> |
||||
* <analyzer> |
||||
* <tokenizer class="solr.StandardTokenizerFactory"/> |
||||
* <filter class="solr.CJKWidthFilterFactory"/> |
||||
* <filter class="solr.LowerCaseFilterFactory"/> |
||||
* <filter class="solr.CJKBigramFilterFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
*/ |
||||
|
||||
public class CJKWidthFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { |
||||
|
||||
@Override |
||||
public TokenStream create(TokenStream input) { |
||||
return new CJKWidthFilter(input); |
||||
} |
||||
|
||||
@Override |
||||
public AbstractAnalysisFactory getMultiTermComponent() { |
||||
return this; |
||||
} |
||||
} |
@ -0,0 +1,42 @@
|
||||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
||||
<!-- |
||||
Licensed to the Apache Software Foundation (ASF) under one or more |
||||
contributor license agreements. See the NOTICE file distributed with |
||||
this work for additional information regarding copyright ownership. |
||||
The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
(the "License"); you may not use this file except in compliance with |
||||
the License. You may obtain a copy of the License at |
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
||||
Unless required by applicable law or agreed to in writing, software |
||||
distributed under the License is distributed on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
See the License for the specific language governing permissions and |
||||
limitations under the License. |
||||
--> |
||||
<html> |
||||
<head> |
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8"> |
||||
</head> |
||||
<body> |
||||
Analyzer for Chinese, Japanese, and Korean, which indexes bigrams. |
||||
This analyzer generates bigram terms, which are overlapping groups of two adjacent Han, Hiragana, Katakana, or Hangul characters. |
||||
<p> |
||||
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way. |
||||
<ul> |
||||
<li>ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as tokens. |
||||
<li>CJKAnalyzer (in this package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens. |
||||
<li>SmartChineseAnalyzer (in the analyzers/smartcn package): Index words (attempt to segment Chinese text into words) as tokens. |
||||
</ul> |
||||
|
||||
Example phrase: "我是中国人" |
||||
<ol> |
||||
<li>ChineseAnalyzer: 我-是-中-国-人</li> |
||||
<li>CJKAnalyzer: 我是-是中-中国-国人</li> |
||||
<li>SmartChineseAnalyzer: 我-是-中国-人</li> |
||||
</ol> |
||||
</p> |
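<p> |
A minimal sketch of producing the CJKAnalyzer tokens above (hypothetical field |
name; assumes the Lucene 3.6-era TokenStream consumer workflow): |
</p> |
<pre class="prettyprint"> |
  Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_36); |
  TokenStream ts = analyzer.tokenStream("content", new StringReader("我是中国人")); |
  CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); |
  ts.reset(); |
  while (ts.incrementToken()) { |
    System.out.println(term.toString());   // 我是, 是中, 中国, 国人 |
  } |
  ts.end(); |
  ts.close(); |
</pre> |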
||||
|
||||
</body> |
||||
</html> |
@ -0,0 +1,50 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cn; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.Reader; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardAnalyzer; // javadoc @link
|
||||
import com.fr.third.org.apache.lucene.analysis.Analyzer; |
||||
import com.fr.third.org.apache.lucene.analysis.Tokenizer; |
||||
|
||||
/** |
||||
* An {@link Analyzer} that tokenizes text with {@link ChineseTokenizer} and |
||||
* filters with {@link ChineseFilter} |
||||
* @deprecated (3.1) Use {@link StandardAnalyzer} instead, which has the same functionality. |
||||
* This analyzer will be removed in Lucene 5.0 |
||||
*/ |
||||
@Deprecated |
||||
public final class ChineseAnalyzer extends Analyzer { |
||||
|
||||
/** |
||||
* Creates |
||||
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* used to tokenize all the text in the provided {@link Reader}. |
||||
* |
||||
* @return {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* built from a {@link ChineseTokenizer} filtered with |
||||
* {@link ChineseFilter} |
||||
*/ |
||||
@Override |
||||
protected TokenStreamComponents createComponents(String fieldName, |
||||
Reader reader) { |
||||
final Tokenizer source = new ChineseTokenizer(reader); |
||||
return new TokenStreamComponents(source, new ChineseFilter(source)); |
||||
} |
||||
} |
@ -0,0 +1,104 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cn; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.util.Arrays; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.core.StopFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet; |
||||
import com.fr.third.org.apache.lucene.util.Version; |
||||
|
||||
/** |
||||
* A {@link TokenFilter} with a stop word table. |
||||
* <ul> |
||||
* <li>Numeric tokens are removed. |
||||
* <li>English tokens must be longer than one character. |
||||
* <li>One Chinese character as one Chinese word. |
||||
* </ul> |
||||
* TO DO: |
||||
* <ol> |
||||
* <li>Add Chinese stop words, such as \ue400 |
||||
* <li>Dictionary based Chinese word extraction |
||||
* <li>Intelligent Chinese word extraction |
||||
* </ol> |
||||
* |
||||
* @deprecated (3.1) Use {@link StopFilter} instead, which has the same functionality. |
||||
* This filter will be removed in Lucene 5.0 |
||||
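* <p> |
* A minimal sketch (hypothetical chain; by the rules above, a 1-letter English |
* token such as "a" and a numeric token such as "123" are dropped, while |
* "linux" and a lone Chinese character both survive): |
* <pre class="prettyprint"> |
*   TokenStream ts = new ChineseFilter(new ChineseTokenizer(reader)); |
* </pre> |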
*/ |
||||
@Deprecated |
||||
public final class ChineseFilter extends TokenFilter { |
||||
|
||||
|
||||
// Only English now, Chinese to be added later.
|
||||
public static final String[] STOP_WORDS = { |
||||
"and", "are", "as", "at", "be", "but", "by", |
||||
"for", "if", "in", "into", "is", "it", |
||||
"no", "not", "of", "on", "or", "such", |
||||
"that", "the", "their", "then", "there", "these", |
||||
"they", "this", "to", "was", "will", "with" |
||||
}; |
||||
|
||||
|
||||
private final CharArraySet stopTable; |
||||
|
||||
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
|
||||
public ChineseFilter(TokenStream in) { |
||||
super(in); |
||||
|
||||
stopTable = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS), false); |
||||
} |
||||
|
||||
@Override |
||||
public boolean incrementToken() throws IOException { |
||||
|
||||
while (input.incrementToken()) { |
||||
char text[] = termAtt.buffer(); |
||||
int termLength = termAtt.length(); |
||||
|
||||
// why not key off token type here assuming ChineseTokenizer comes first?
|
||||
if (!stopTable.contains(text, 0, termLength)) { |
||||
switch (Character.getType(text[0])) { |
||||
|
||||
case Character.LOWERCASE_LETTER: |
||||
case Character.UPPERCASE_LETTER: |
||||
|
||||
// English words/tokens must be longer than 1 character.
|
||||
if (termLength>1) { |
||||
return true; |
||||
} |
||||
break; |
||||
case Character.OTHER_LETTER: |
||||
|
||||
// One Chinese character as one Chinese word.
|
||||
// Chinese word extraction to be added later here.
|
||||
|
||||
return true; |
||||
} |
||||
|
||||
} |
||||
|
||||
} |
||||
return false; |
||||
} |
||||
|
||||
} |
@ -0,0 +1,36 @@
package com.fr.third.org.apache.lucene.analysis.cn;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.core.StopFilterFactory; // javadocs
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link ChineseFilter}.
 * @deprecated Use {@link StopFilterFactory} instead.
 */
@Deprecated
public class ChineseFilterFactory extends TokenFilterFactory {

  @Override
  public ChineseFilter create(TokenStream in) {
    return new ChineseFilter(in);
  }
}
@ -0,0 +1,169 @@
package com.fr.third.org.apache.lucene.analysis.cn;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import com.fr.third.org.apache.lucene.util.AttributeSource;
import com.fr.third.org.apache.lucene.util.AttributeSource.AttributeFactory; // required by the AttributeFactory constructor below

/**
 * Tokenize Chinese text as individual Chinese characters.
 *
 * <p>
 * ChineseTokenizer and CJKTokenizer differ in their token parsing logic.
 * </p>
 * <p>
 * For example, if the Chinese text "C1C2C3C4" is to be indexed:
 * <ul>
 * <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4.
 * <li>The tokens returned from CJKTokenizer are C1C2, C2C3, C3C4.
 * </ul>
 * </p>
 * <p>
 * Therefore the index created by CJKTokenizer is much larger.
 * </p>
 * <p>
 * The trade-off is that searches for C1, C1C2, C1C3, C4C2, C1C2C3 ... all
 * work with ChineseTokenizer, but not with CJKTokenizer.
 * </p>
 * @deprecated (3.1) Use {@link StandardTokenizer} instead, which has the same functionality.
 * This tokenizer will be removed in Lucene 5.0
 */
@Deprecated
public final class ChineseTokenizer extends Tokenizer {

  public ChineseTokenizer(Reader in) {
    super(in);
  }

  public ChineseTokenizer(AttributeSource source, Reader in) {
    super(source, in);
  }

  public ChineseTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
  }

  private int offset = 0, bufferIndex = 0, dataLen = 0;
  private final static int MAX_WORD_LEN = 255;
  private final static int IO_BUFFER_SIZE = 1024;
  private final char[] buffer = new char[MAX_WORD_LEN];
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  private int length;
  private int start;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private final void push(char c) {
    if (length == 0) start = offset - 1;         // start of token
    buffer[length++] = Character.toLowerCase(c); // buffer it
  }

  private final boolean flush() {
    if (length > 0) {
      termAtt.copyBuffer(buffer, 0, length);
      offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
      return true;
    } else {
      return false;
    }
  }

  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();

    length = 0;
    start = offset;

    while (true) {
      final char c;
      offset++;

      if (bufferIndex >= dataLen) {
        dataLen = input.read(ioBuffer);
        bufferIndex = 0;
      }

      if (dataLen == -1) {
        offset--;
        return flush();
      } else {
        c = ioBuffer[bufferIndex++];
      }

      switch (Character.getType(c)) {

        case Character.DECIMAL_DIGIT_NUMBER:
        case Character.LOWERCASE_LETTER:
        case Character.UPPERCASE_LETTER:
          push(c);
          if (length == MAX_WORD_LEN) return flush();
          break;

        case Character.OTHER_LETTER:
          if (length > 0) {
            // A buffered Latin/digit run ends here; put the CJK character back.
            bufferIndex--;
            offset--;
            return flush();
          }
          push(c);
          return flush();

        default:
          if (length > 0) return flush();
          break;
      }
    }
  }

  @Override
  public final void end() {
    // set final offset
    final int finalOffset = correctOffset(offset);
    this.offsetAtt.setOffset(finalOffset, finalOffset);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    offset = bufferIndex = dataLen = 0;
  }
}
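
The loop above buffers consecutive Latin letters and digits into a single token but flushes each CJK character (Character.OTHER_LETTER) as its own token. A short sketch illustrating the resulting tokens and offsets; the demo class and input are illustrative assumptions:

import java.io.IOException;
import java.io.StringReader;

import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.cn.ChineseTokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class ChineseTokenizerDemo {
  public static void main(String[] args) throws IOException {
    Tokenizer tokenizer = new ChineseTokenizer(new StringReader("abc中文123"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // Expected (term[start,end)): abc[0,3)  中[3,4)  文[4,5)  123[5,8)
      System.out.println(term + "[" + offset.startOffset() + "," + offset.endOffset() + ")");
    }
    tokenizer.end();
    tokenizer.close();
  }
}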
@ -0,0 +1,37 @@
package com.fr.third.org.apache.lucene.analysis.cn;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;

import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizerFactory; // javadocs
import com.fr.third.org.apache.lucene.analysis.util.TokenizerFactory;

/**
 * Factory for {@link ChineseTokenizer}.
 * @deprecated Use {@link StandardTokenizerFactory} instead.
 */
@Deprecated
public class ChineseTokenizerFactory extends TokenizerFactory {

  @Override
  public ChineseTokenizer create(Reader in) {
    return new ChineseTokenizer(in);
  }
}
@ -0,0 +1,41 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
Analyzer for Chinese, which indexes unigrams (individual Chinese characters).
<p>
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
<ul>
  <li>StandardAnalyzer: Indexes unigrams (individual Chinese characters) as tokens.
  <li>CJKAnalyzer (in the analyzers/cjk package): Indexes bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
  <li>SmartChineseAnalyzer (in the analyzers/smartcn package): Indexes words (attempts to segment Chinese text into words) as tokens.
</ul>

Example phrase: "我是中国人"
<ol>
  <li>StandardAnalyzer: 我-是-中-国-人</li>
  <li>CJKAnalyzer: 我是-是中-中国-国人</li>
  <li>SmartChineseAnalyzer: 我-是-中国-人</li>
</ol>
</p>
</body>
</html>
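
The unigram case above can be reproduced with StandardAnalyzer, which the standard package elsewhere in this commit provides; CJKAnalyzer and SmartChineseAnalyzer live in separate modules and may not be bundled here, so only the first case is sketched. The demo class, field name, and input are illustrative assumptions:

import java.io.IOException;
import java.io.StringReader;

import com.fr.third.org.apache.lucene.analysis.Analyzer;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.standard.StandardAnalyzer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.util.Version;

public class ChineseSegmentationDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    TokenStream ts = analyzer.tokenStream("body", new StringReader("我是中国人"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.print(term + " "); // expected unigrams: 我 是 中 国 人
    }
    ts.end();
    ts.close();
  }
}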
@ -0,0 +1,176 @@
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */

package com.fr.third.org.apache.lucene.analysis.commongrams;

import java.io.IOException;

import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

/*
 * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
 */

/**
 * Construct bigrams for frequently occurring terms while indexing. Single terms
 * are still indexed too, with bigrams overlaid. This is achieved through the
 * use of {@link PositionIncrementAttribute#setPositionIncrement(int)}. Bigrams have a type
 * of {@link #GRAM_TYPE}. Example:
 * <ul>
 * <li>input: "the quick brown fox" (with "the" as a common word)</li>
 * <li>output: |"the", "the_quick"|"quick"|"brown"|"fox"|</li>
 * <li>"the_quick" has a position increment of 0, so it is in the same position
 * as "the"; "the_quick" has a term.type() of "gram"</li>
 * </ul>
 */

/*
 * Constructors and makeCommonSet based on similar code in StopFilter
 */
public final class CommonGramsFilter extends TokenFilter {

  public static final String GRAM_TYPE = "gram";
  private static final char SEPARATOR = '_';

  private final CharArraySet commonWords;

  private final StringBuilder buffer = new StringBuilder();

  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);

  private int lastStartOffset;
  private boolean lastWasCommon;
  private State savedState;

  /**
   * Construct a token stream filtering the given input using a set of common
   * words to create bigrams. Outputs both unigrams with position increment and
   * bigrams with position increment 0 and type="gram", where one or both of
   * the words in a potential bigram are in the set of common words.
   *
   * @param matchVersion the Lucene match version
   * @param input TokenStream input in filter chain
   * @param commonWords The set of common words.
   */
  public CommonGramsFilter(Version matchVersion, TokenStream input, CharArraySet commonWords) {
    super(input);
    this.commonWords = commonWords;
  }

  /**
   * Inserts bigrams for common words into a token stream. For each input token,
   * output the token. If the token and/or the following token are in the list
   * of common words, also output a bigram with position increment 0 and
   * type="gram".
   *
   * TODO: Consider adding an option to not emit unigram stopwords
   * as in CDL XTF BigramStopFilter; CommonGramsQueryFilter would need to be
   * changed to work with this.
   *
   * TODO: Consider optimizing for the case of three common grams, i.e.
   * "man of the year" normally produces 3 bigrams: "man-of", "of-the",
   * "the-year", but with proper management of positions we could eliminate
   * the middle bigram "of-the" and save a disk seek and a whole set of
   * position lookups.
   */
  @Override
  public boolean incrementToken() throws IOException {
    // get the next piece of input
    if (savedState != null) {
      restoreState(savedState);
      savedState = null;
      saveTermBuffer();
      return true;
    } else if (!input.incrementToken()) {
      return false;
    }

    /* We build n-grams before and after stopwords.
     * When valid, the buffer always contains at least the separator.
     * If it's empty, there is nothing before this stopword.
     */
    if (lastWasCommon || (isCommon() && buffer.length() > 0)) {
      savedState = captureState();
      gramToken();
      return true;
    }

    saveTermBuffer();
    return true;
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    lastWasCommon = false;
    savedState = null;
    buffer.setLength(0);
  }

  // ================================================= Helper Methods ================================================

  /**
   * Determines if the current token is a common term.
   *
   * @return {@code true} if the current token is a common term, {@code false} otherwise
   */
  private boolean isCommon() {
    return commonWords != null && commonWords.contains(termAttribute.buffer(), 0, termAttribute.length());
  }

  /**
   * Saves this information to form the left part of a gram.
   */
  private void saveTermBuffer() {
    buffer.setLength(0);
    buffer.append(termAttribute.buffer(), 0, termAttribute.length());
    buffer.append(SEPARATOR);
    lastStartOffset = offsetAttribute.startOffset();
    lastWasCommon = isCommon();
  }

  /**
   * Constructs a compound token.
   */
  private void gramToken() {
    buffer.append(termAttribute.buffer(), 0, termAttribute.length());
    int endOffset = offsetAttribute.endOffset();

    clearAttributes();

    int length = buffer.length();
    char[] termText = termAttribute.buffer();
    if (length > termText.length) {
      termText = termAttribute.resizeBuffer(length);
    }

    buffer.getChars(0, length, termText, 0);
    termAttribute.setLength(length);
    posIncAttribute.setPositionIncrement(0);
    posLenAttribute.setPositionLength(2); // bigram
    offsetAttribute.setOffset(lastStartOffset, endOffset);
    typeAttribute.setType(GRAM_TYPE);
    buffer.setLength(0);
  }
}
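
A sketch of the filter at index time, chaining a WhitespaceTokenizer into CommonGramsFilter with "the" as the only common word. The demo class is illustrative; the expected output in the comment follows from the incrementToken/saveTermBuffer/gramToken logic above:

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import com.fr.third.org.apache.lucene.analysis.core.WhitespaceTokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

public class CommonGramsDemo {
  public static void main(String[] args) throws IOException {
    CharArraySet common = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList("the"), true);
    Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("the quick brown fox"));
    TokenStream ts = new CommonGramsFilter(Version.LUCENE_CURRENT, tok, common);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // Expected: the(+1,word) the_quick(+0,gram) quick(+1,word) brown(+1,word) fox(+1,word)
      System.out.printf("%s(+%d,%s) ", term, posInc.getPositionIncrement(), type.type());
    }
    ts.end();
    ts.close();
  }
}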
@ -0,0 +1,79 @@
package com.fr.third.org.apache.lucene.analysis.commongrams;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.core.StopAnalyzer;
import com.fr.third.org.apache.lucene.analysis.util.*;

/**
 * Constructs a {@link CommonGramsFilter}.
 * <pre class="prettyprint">
 * <fieldType name="text_cmmngrms" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *     <filter class="solr.CommonGramsFilterFactory" words="commongramsstopwords.txt" ignoreCase="false"/>
 *   </analyzer>
 * </fieldType></pre>
 */

/*
 * This is pretty close to a straight copy from StopFilterFactory.
 */
public class CommonGramsFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {

  public void inform(ResourceLoader loader) throws IOException {
    String commonWordFiles = args.get("words");
    ignoreCase = getBoolean("ignoreCase", false);

    if (commonWordFiles != null) {
      if ("snowball".equalsIgnoreCase(args.get("format"))) {
        commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
      } else {
        commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
      }
    } else {
      commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
  }

  // Force the use of a CharArraySet, as it is the most performant, although
  // this may break things if Lucene ever goes away from it. See SOLR-1095.
  private CharArraySet commonWords;
  private boolean ignoreCase;

  public boolean isIgnoreCase() {
    return ignoreCase;
  }

  public CharArraySet getCommonWords() {
    return commonWords;
  }

  @Override
  public CommonGramsFilter create(TokenStream input) {
    return new CommonGramsFilter(luceneMatchVersion, input, commonWords);
  }
}
@ -0,0 +1,126 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.fr.third.org.apache.lucene.analysis.commongrams;

import java.io.IOException;

import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import static com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsFilter.GRAM_TYPE;

/**
 * Wraps a CommonGramsFilter, optimizing phrase queries by only returning single
 * words when they are not a member of a bigram.
 *
 * Example:
 * <ul>
 * <li>query input to CommonGramsFilter: "the rain in spain falls mainly"
 * <li>output of CommonGramsFilter/input to CommonGramsQueryFilter:
 * |"the", "the_rain"|"rain", "rain_in"|"in", "in_spain"|"spain"|"falls"|"mainly"
 * <li>output of CommonGramsQueryFilter: "the_rain", "rain_in", "in_spain",
 * "falls", "mainly"
 * </ul>
 */

/*
 * See: http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/TokenStream.html and
 * http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/analysis/package.html?revision=718798
 */
public final class CommonGramsQueryFilter extends TokenFilter {

  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);

  private State previous;
  private String previousType;
  private boolean exhausted;

  /**
   * Constructs a new CommonGramsQueryFilter based on the provided CommonGramsFilter.
   *
   * @param input CommonGramsFilter the QueryFilter will use
   */
  public CommonGramsQueryFilter(CommonGramsFilter input) {
    super(input);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    previous = null;
    previousType = null;
    exhausted = false;
  }

  /**
   * Output bigrams whenever possible to optimize queries. Only output unigrams
   * when they are not a member of a bigram. Example:
   * <ul>
   * <li>input: "the rain in spain falls mainly"
   * <li>output: "the_rain", "rain_in", "in_spain", "falls", "mainly"
   * </ul>
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (!exhausted && input.incrementToken()) {
      State current = captureState();

      if (previous != null && !isGramType()) {
        // The current token is a unigram; emit the buffered previous token instead.
        restoreState(previous);
        previous = current;
        previousType = typeAttribute.type();

        if (isGramType()) {
          posIncAttribute.setPositionIncrement(1);
        }
        return true;
      }

      previous = current;
    }

    exhausted = true;

    // Suppress the final buffered unigram if the last emitted token was a gram.
    if (previous == null || GRAM_TYPE.equals(previousType)) {
      return false;
    }

    restoreState(previous);
    previous = null;

    if (isGramType()) {
      posIncAttribute.setPositionIncrement(1);
    }
    return true;
  }

  // ================================================= Helper Methods ================================================

  /**
   * Convenience method to check if the current type is a gram type.
   *
   * @return {@code true} if the current type is a gram type, {@code false} otherwise
   */
  public boolean isGramType() {
    return GRAM_TYPE.equals(typeAttribute.type());
  }
}
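
A query-time sketch pairing the two filters. With "the" as a common word, the query "the fox" reduces to the single bigram "the_fox": both unigrams are members of the bigram, so they are suppressed. The demo class and inputs are illustrative assumptions:

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import com.fr.third.org.apache.lucene.analysis.core.WhitespaceTokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

public class CommonGramsQueryDemo {
  public static void main(String[] args) throws IOException {
    CharArraySet common = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList("the"), true);
    Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("the fox"));
    TokenStream ts = new CommonGramsQueryFilter(
        new CommonGramsFilter(Version.LUCENE_CURRENT, tok, common));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.print(term + " "); // expected: the_fox
    }
    ts.end();
    ts.close();
  }
}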