
lucene4.0: rename package names

10.0
CL committed 7 years ago
commit d75ef79fe8
  1. 6
      build.third_step6.gradle
  2. 19
      fine-lucene/resources/META-INF/services/com.fr.third.org.apache.lucene.analysis.util.CharFilterFactory
  3. 92
      fine-lucene/resources/META-INF/services/com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory
  4. 31
      fine-lucene/resources/META-INF/services/com.fr.third.org.apache.lucene.analysis.util.TokenizerFactory
  5. 17
      fine-lucene/resources/META-INF/services/com.fr.third.org.apache.lucene.codecs.Codec
  6. 16
      fine-lucene/resources/META-INF/services/com.fr.third.org.apache.lucene.codecs.PostingsFormat
  7. 125
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/ar/stopwords.txt
  8. 193
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/bg/stopwords.txt
  9. 128
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/br/stopwords.txt
  10. 220
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/ca/stopwords.txt
  11. 35
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/cjk/stopwords.txt
  12. 67
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd
  13. 172
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/cz/stopwords.txt
  14. 78
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/el/stopwords.txt
  15. 99
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/eu/stopwords.txt
  16. 313
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/fa/stopwords.txt
  17. 110
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/ga/stopwords.txt
  18. 647
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/gl/galician.rslp
  19. 161
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/gl/stopwords.txt
  20. 235
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/hi/stopwords.txt
  21. 46
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/hy/stopwords.txt
  22. 359
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/id/stopwords.txt
  23. 172
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/lv/stopwords.txt
  24. 456
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/pt/portuguese.rslp
  25. 233
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/ro/stopwords.txt
  26. 108
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/danish_stop.txt
  27. 117
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/dutch_stop.txt
  28. 317
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/english_stop.txt
  29. 95
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/finnish_stop.txt
  30. 183
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/french_stop.txt
  31. 292
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/german_stop.txt
  32. 209
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/hungarian_stop.txt
  33. 301
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/italian_stop.txt
  34. 192
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/norwegian_stop.txt
  35. 251
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/portuguese_stop.txt
  36. 241
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/russian_stop.txt
  37. 354
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/spanish_stop.txt
  38. 131
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/swedish_stop.txt
  39. 119
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/th/stopwords.txt
  40. 212
      fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/tr/stopwords.txt
  41. 29
      fine-lucene/src/com/fr/third/org/apache/lucene/LucenePackage.java
  42. 393
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/Analyzer.java
  43. 83
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/AnalyzerWrapper.java
  44. 98
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/CachingTokenFilter.java
  45. 84
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/CharFilter.java
  46. 321
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/NumericTokenStream.java
  47. 651
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/Token.java
  48. 72
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/TokenFilter.java
  49. 181
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/TokenStream.java
  50. 99
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/Tokenizer.java
  51. 153
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
  52. 96
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
  53. 43
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicLetterTokenizerFactory.java
  54. 48
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java
  55. 48
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicNormalizationFilterFactory.java
  56. 101
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicNormalizer.java
  57. 58
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicStemFilter.java
  58. 43
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicStemFilterFactory.java
  59. 150
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicStemmer.java
  60. 22
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/package.html
  61. 131
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
  62. 58
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
  63. 40
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/bg/BulgarianStemFilterFactory.java
  64. 143
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/bg/BulgarianStemmer.java
  65. 22
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/bg/package.html
  66. 138
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
  67. 76
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/br/BrazilianStemFilter.java
  68. 41
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/br/BrazilianStemFilterFactory.java
  69. 1024
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/br/BrazilianStemmer.java
  70. 22
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/br/package.html
  71. 148
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
  72. 22
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ca/package.html
  73. 110
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
  74. 162
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
  75. 64
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
  76. 31821
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
  77. 919
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
  78. 70
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterFactory.java
  79. 191
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/MappingCharFilter.java
  80. 135
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java
  81. 127
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java
  82. 539
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/htmlentity.py
  83. 61
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/package.html
  84. 104
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
  85. 363
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
  86. 67
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java
  87. 311
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKTokenizer.java
  88. 41
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKTokenizerFactory.java
  89. 112
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKWidthFilter.java
  90. 50
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKWidthFilterFactory.java
  91. 42
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/package.html
  92. 50
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
  93. 104
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cn/ChineseFilter.java
  94. 36
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cn/ChineseFilterFactory.java
  95. 169
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cn/ChineseTokenizer.java
  96. 37
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cn/ChineseTokenizerFactory.java
  97. 41
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cn/package.html
  98. 176
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java
  99. 79
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
  100. 126
      fine-lucene/src/com/fr/third/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java
Some files were not shown because too many files have changed in this diff.

6
build.third_step6.gradle

@@ -24,8 +24,10 @@ sourceSets{
             "${srcDir}/fine-jackson/src",
             "${srcDir}/fine-jackson/resources",
             "${srcDir}/fine-ehcache/src",
-            "${srcDir}/fine-ehcache/resources"
+            "${srcDir}/fine-ehcache/resources",
+            "${srcDir}/fine-guava/src",
+            "${srcDir}/fine-lucene/src",
+            "${srcDir}/fine-lucene/resources",
         ]
     }
 }
@@ -72,6 +74,8 @@ task copyFiles(type:Copy,dependsOn:'compileJava'){
         with dataContent.call("${srcDir}/fine-ehcache/src")
         with dataContent.call("${srcDir}/fine-ehcache/resources")
         with dataContent.call("${srcDir}/fine-guava/src")
+        with dataContent.call("${srcDir}/fine-lucene/src")
+        with dataContent.call("${srcDir}/fine-lucene/resources")
         into "${classesDir}"
     }
 }
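
With the repackaged sources wired into the build, downstream code reaches Lucene only through the com.fr.third prefix. A minimal indexing smoke test under that assumption (the index path and field name are hypothetical, and it assumes the repackaged tree carries the stock Lucene 4.0 classes, just relocated):

import java.io.File;

import com.fr.third.org.apache.lucene.analysis.standard.StandardAnalyzer;
import com.fr.third.org.apache.lucene.document.Document;
import com.fr.third.org.apache.lucene.document.Field;
import com.fr.third.org.apache.lucene.document.TextField;
import com.fr.third.org.apache.lucene.index.IndexWriter;
import com.fr.third.org.apache.lucene.index.IndexWriterConfig;
import com.fr.third.org.apache.lucene.store.Directory;
import com.fr.third.org.apache.lucene.store.FSDirectory;
import com.fr.third.org.apache.lucene.util.Version;

public class RepackagedLuceneSmokeTest {
    public static void main(String[] args) throws Exception {
        // Hypothetical index location; any writable directory works.
        Directory dir = FSDirectory.open(new File("/tmp/fine-lucene-idx"));
        IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_40,
                new StandardAnalyzer(Version.LUCENE_40));
        IndexWriter writer = new IndexWriter(dir, cfg);
        Document doc = new Document();
        doc.add(new TextField("body", "hello repackaged lucene", Field.Store.YES));
        writer.addDocument(doc);
        writer.close(); // commits and releases the write lock
    }
}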

19
fine-lucene/resources/META-INF/services/com.fr.third.org.apache.lucene.analysis.util.CharFilterFactory

@@ -0,0 +1,19 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
com.fr.third.org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory
com.fr.third.org.apache.lucene.analysis.charfilter.MappingCharFilterFactory
com.fr.third.org.apache.lucene.analysis.fa.PersianCharFilterFactory
com.fr.third.org.apache.lucene.analysis.pattern.PatternReplaceCharFilterFactory
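
These META-INF/services files are what java.util.ServiceLoader reads, so both the file name and every entry must use the repackaged class names; a leftover org.apache.lucene entry would simply stop resolving. A minimal sketch of the lookup this file feeds (the same applies to the TokenFilterFactory and TokenizerFactory files below, and Lucene's own by-name factory lookup rides on the same provider list):

import java.util.ServiceLoader;

import com.fr.third.org.apache.lucene.analysis.util.CharFilterFactory;

public class CharFilterSpiDemo {
    public static void main(String[] args) {
        // Enumerates every provider registered under
        // META-INF/services/com.fr.third.org.apache.lucene.analysis.util.CharFilterFactory
        for (CharFilterFactory factory : ServiceLoader.load(CharFilterFactory.class)) {
            System.out.println(factory.getClass().getName());
        }
    }
}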

92
fine-lucene/resources/META-INF/services/com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory

@@ -0,0 +1,92 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
com.fr.third.org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.ar.ArabicStemFilterFactory
com.fr.third.org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.br.BrazilianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
com.fr.third.org.apache.lucene.analysis.cjk.CJKWidthFilterFactory
com.fr.third.org.apache.lucene.analysis.cn.ChineseFilterFactory
com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory
com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory
com.fr.third.org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilterFactory
com.fr.third.org.apache.lucene.analysis.core.StopFilterFactory
com.fr.third.org.apache.lucene.analysis.core.TypeTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.cz.CzechStemFilterFactory
com.fr.third.org.apache.lucene.analysis.de.GermanLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.de.GermanMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.de.GermanNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.de.GermanStemFilterFactory
com.fr.third.org.apache.lucene.analysis.el.GreekLowerCaseFilterFactory
com.fr.third.org.apache.lucene.analysis.el.GreekStemFilterFactory
com.fr.third.org.apache.lucene.analysis.en.EnglishMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.en.EnglishPossessiveFilterFactory
com.fr.third.org.apache.lucene.analysis.en.KStemFilterFactory
com.fr.third.org.apache.lucene.analysis.en.PorterStemFilterFactory
com.fr.third.org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.fr.FrenchMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.ga.IrishLowerCaseFilterFactory
com.fr.third.org.apache.lucene.analysis.gl.GalicianMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.gl.GalicianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.hi.HindiStemFilterFactory
com.fr.third.org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory
com.fr.third.org.apache.lucene.analysis.id.IndonesianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.in.IndicNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.it.ItalianLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.lv.LatvianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
com.fr.third.org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory
com.fr.third.org.apache.lucene.analysis.ngram.NGramFilterFactory
com.fr.third.org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
com.fr.third.org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.payloads.NumericPayloadTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.position.PositionFilterFactory
com.fr.third.org.apache.lucene.analysis.pt.PortugueseLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.pt.PortugueseMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.pt.PortugueseStemFilterFactory
com.fr.third.org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
com.fr.third.org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.shingle.ShingleFilterFactory
com.fr.third.org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
com.fr.third.org.apache.lucene.analysis.standard.ClassicFilterFactory
com.fr.third.org.apache.lucene.analysis.standard.StandardFilterFactory
com.fr.third.org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.synonym.SynonymFilterFactory
com.fr.third.org.apache.lucene.analysis.th.ThaiWordFilterFactory
com.fr.third.org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory
com.fr.third.org.apache.lucene.analysis.util.ElisionFilterFactory
com.fr.third.org.apache.lucene.collation.CollationKeyFilterFactory

31
fine-lucene/resources/META-INF/services/com.fr.third.org.apache.lucene.analysis.util.TokenizerFactory

@@ -0,0 +1,31 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
com.fr.third.org.apache.lucene.analysis.ar.ArabicLetterTokenizerFactory
com.fr.third.org.apache.lucene.analysis.cjk.CJKTokenizerFactory
com.fr.third.org.apache.lucene.analysis.cn.ChineseTokenizerFactory
com.fr.third.org.apache.lucene.analysis.core.KeywordTokenizerFactory
com.fr.third.org.apache.lucene.analysis.core.LetterTokenizerFactory
com.fr.third.org.apache.lucene.analysis.core.LowerCaseTokenizerFactory
com.fr.third.org.apache.lucene.analysis.core.WhitespaceTokenizerFactory
com.fr.third.org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory
com.fr.third.org.apache.lucene.analysis.ngram.NGramTokenizerFactory
com.fr.third.org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory
com.fr.third.org.apache.lucene.analysis.pattern.PatternTokenizerFactory
com.fr.third.org.apache.lucene.analysis.ru.RussianLetterTokenizerFactory
com.fr.third.org.apache.lucene.analysis.standard.ClassicTokenizerFactory
com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizerFactory
com.fr.third.org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory
com.fr.third.org.apache.lucene.analysis.wikipedia.WikipediaTokenizerFactory

17
fine-lucene/resources/META-INF/services/com.fr.third.org.apache.lucene.codecs.Codec

@@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
com.fr.third.org.apache.lucene.codecs.lucene40.Lucene40Codec
com.fr.third.org.apache.lucene.codecs.lucene3x.Lucene3xCodec

16
fine-lucene/resources/META-INF/services/com.fr.third.org.apache.lucene.codecs.PostingsFormat

@@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
com.fr.third.org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat
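
The codec registrations work the same way: Codec.forName and PostingsFormat.forName resolve these names through the two service files above, so an index written by the repackaged classes still identifies itself as "Lucene40". A short sketch of the lookup:

import com.fr.third.org.apache.lucene.codecs.Codec;
import com.fr.third.org.apache.lucene.codecs.PostingsFormat;

public class CodecSpiDemo {
    public static void main(String[] args) {
        // Both names come straight from the service files above.
        Codec codec = Codec.forName("Lucene40");
        PostingsFormat postings = PostingsFormat.forName("Lucene40");
        System.out.println(codec.getName() + " / " + postings.getName());
    }
}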

125
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/ar/stopwords.txt

@@ -0,0 +1,125 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
# Cleaned on October 11, 2009 (not normalized, so use before normalization)
# This means that when modifying this list, you might need to add some
# redundant entries, for example containing forms with both أ and ا
من
ومن
منها
منه
في
وفي
فيها
فيه
و
ف
ثم
او
أو
ب
بها
به
ا
أ
اى
اي
أي
أى
لا
ولا
الا
ألا
إلا
لكن
ما
وما
كما
فما
عن
مع
اذا
إذا
ان
أن
إن
انها
أنها
إنها
انه
أنه
إنه
بان
بأن
فان
فأن
وان
وأن
وإن
التى
التي
الذى
الذي
الذين
الى
الي
إلى
إلي
على
عليها
عليه
اما
أما
إما
ايضا
أيضا
كل
وكل
لم
ولم
لن
ولن
هى
هي
هو
وهى
وهي
وهو
فهى
فهي
فهو
انت
أنت
لك
لها
له
هذه
هذا
تلك
ذلك
هناك
كانت
كان
يكون
تكون
وكانت
وكان
غير
بعض
قد
نحو
بين
بينما
منذ
ضمن
حيث
الان
الآن
خلال
بعد
قبل
حتى
عند
عندما
لدى
جميع
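
The analyzers load their default stop sets from exactly these classpath locations, which is why the resources moved together with the renamed packages. A sketch of the consumer of this particular file (constructor as in stock Lucene 4.0):

import com.fr.third.org.apache.lucene.analysis.ar.ArabicAnalyzer;
import com.fr.third.org.apache.lucene.util.Version;

public class ArabicStopwordsDemo {
    public static void main(String[] args) {
        // With no explicit stop set, the analyzer reads
        // com/fr/third/org/apache/lucene/analysis/ar/stopwords.txt from the classpath.
        ArabicAnalyzer analyzer = new ArabicAnalyzer(Version.LUCENE_40);
        System.out.println(analyzer.getStopwordSet().size());
    }
}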

193
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/bg/stopwords.txt

@@ -0,0 +1,193 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
а
аз
ако
ала
бе
без
беше
би
бил
била
били
било
близо
бъдат
бъде
бяха
в
вас
ваш
ваша
вероятно
вече
взема
ви
вие
винаги
все
всеки
всички
всичко
всяка
във
въпреки
върху
г
ги
главно
го
д
да
дали
до
докато
докога
дори
досега
доста
е
едва
един
ето
за
зад
заедно
заради
засега
затова
защо
защото
и
из
или
им
има
имат
иска
й
каза
как
каква
какво
както
какъв
като
кога
когато
което
които
кой
който
колко
която
къде
където
към
ли
м
ме
между
мен
ми
мнозина
мога
могат
може
моля
момента
му
н
на
над
назад
най
направи
напред
например
нас
не
него
нея
ни
ние
никой
нито
но
някои
някой
няма
обаче
около
освен
особено
от
отгоре
отново
още
пак
по
повече
повечето
под
поне
поради
после
почти
прави
пред
преди
през
при
пък
първо
с
са
само
се
сега
си
скоро
след
сме
според
сред
срещу
сте
съм
със
също
т
тази
така
такива
такъв
там
твой
те
тези
ти
тн
то
това
тогава
този
той
толкова
точно
трябва
тук
тъй
тя
тях
у
харесва
ч
че
често
чрез
ще
щом
я

128
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/br/stopwords.txt

@@ -0,0 +1,128 @@
a
ainda
alem
ambas
ambos
antes
ao
aonde
aos
apos
aquele
aqueles
as
assim
com
como
contra
contudo
cuja
cujas
cujo
cujos
da
das
de
dela
dele
deles
demais
depois
desde
desta
deste
dispoe
dispoem
diversa
diversas
diversos
do
dos
durante
e
ela
elas
ele
eles
em
entao
entre
essa
essas
esse
esses
esta
estas
este
estes
ha
isso
isto
logo
mais
mas
mediante
menos
mesma
mesmas
mesmo
mesmos
na
nas
nao
nas
nem
nesse
neste
nos
o
os
ou
outra
outras
outro
outros
pelas
pelas
pelo
pelos
perante
pois
por
porque
portanto
proprio
propios
quais
qual
qualquer
quando
quanto
que
quem
quer
se
seja
sem
sendo
seu
seus
sob
sobre
sua
suas
tal
tambem
teu
teus
toda
todas
todo
todos
tua
tuas
tudo
um
uma
umas
uns

220
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/ca/stopwords.txt

@@ -0,0 +1,220 @@
# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed)
a
abans
ací
ah
així
això
al
als
aleshores
algun
alguna
algunes
alguns
alhora
allà
allí
allò
altra
altre
altres
amb
ambdós
ambdues
apa
aquell
aquella
aquelles
aquells
aquest
aquesta
aquestes
aquests
aquí
baix
cada
cadascú
cadascuna
cadascunes
cadascuns
com
contra
d'un
d'una
d'unes
d'uns
dalt
de
del
dels
des
després
dins
dintre
donat
doncs
durant
e
eh
el
els
em
en
encara
ens
entre
érem
eren
éreu
es
és
esta
està
estàvem
estaven
estàveu
esteu
et
etc
ets
fins
fora
gairebé
ha
han
has
havia
he
hem
heu
hi
ho
i
igual
iguals
ja
l'hi
la
les
li
li'n
llavors
m'he
ma
mal
malgrat
mateix
mateixa
mateixes
mateixos
me
mentre
més
meu
meus
meva
meves
molt
molta
moltes
molts
mon
mons
n'he
n'hi
ne
ni
no
nogensmenys
només
nosaltres
nostra
nostre
nostres
o
oh
oi
on
pas
pel
pels
per
però
perquè
poc
poca
pocs
poques
potser
propi
qual
quals
quan
quant
que
què
quelcom
qui
quin
quina
quines
quins
s'ha
s'han
sa
semblant
semblants
ses
seu
seus
seva
seva
seves
si
sobre
sobretot
sóc
solament
sols
son
són
sons
sota
sou
t'ha
t'han
t'he
ta
tal
també
tampoc
tan
tant
tanta
tantes
teu
teus
teva
teves
ton
tons
tot
tota
totes
tots
un
una
unes
uns
us
va
vaig
vam
van
vas
veu
vosaltres
vostra
vostre
vostres

35
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/cjk/stopwords.txt

@@ -0,0 +1,35 @@
a
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
s
such
t
that
the
their
then
there
these
they
this
to
was
will
with
www

67
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd

@@ -0,0 +1,67 @@
<?xml version="1.0" encoding="US-ASCII"?>
<!--
Copyright 1999-2004 The Apache Software Foundation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
classes, exceptions?, patterns)>
<!-- Hyphen character to be used in the exception list as shortcut for
<hyphen pre-break="-"/>. Defaults to '-'
-->
<!ELEMENT hyphen-char EMPTY>
<!ATTLIST hyphen-char value CDATA #REQUIRED>
<!-- Default minimum length in characters of hyphenated word fragments
before and after the line break. For some languages this is not
only for aesthetic purposes, wrong hyphens may be generated if this
is not accounted for.
-->
<!ELEMENT hyphen-min EMPTY>
<!ATTLIST hyphen-min before CDATA #REQUIRED>
<!ATTLIST hyphen-min after CDATA #REQUIRED>
<!-- Character equivalent classes: space separated list of character groups, all
characters in a group are to be treated equivalent as far as
the hyphenation algorithm is concerned. The first character in a group
is the group's equivalent character. Patterns should only contain
first characters. It also defines word characters, i.e. a word that
contains characters not present in any of the classes is not hyphenated.
-->
<!ELEMENT classes (#PCDATA)>
<!-- Hyphenation exceptions: space separated list of hyphenated words.
A hyphen is indicated by the hyphen tag, but you can use the
hyphen-char defined previously as shortcut. This is in cases
when the algorithm procedure finds wrong hyphens or you want
to provide your own hyphenation for some words.
-->
<!ELEMENT exceptions (#PCDATA|hyphen)* >
<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
characters as described before, between any two word characters a digit
in the range 0 to 9 may be specified. The absence of a digit is equivalent
to zero. The '.' character is reserved to indicate beginning or ending
of words. -->
<!ELEMENT patterns (#PCDATA)>
<!-- A "full hyphen" equivalent to TeX's \discretionary
with pre-break, post-break and no-break attributes.
To be used in the exceptions list, the hyphen character is not
automatically added -->
<!ELEMENT hyphen EMPTY>
<!ATTLIST hyphen pre CDATA #IMPLIED>
<!ATTLIST hyphen no CDATA #IMPLIED>
<!ATTLIST hyphen post CDATA #IMPLIED>
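
This DTD describes the pattern grammars consumed by the hyphenation-based compound word filter. A sketch of loading such a grammar (de.xml is a hypothetical pattern file that validates against this DTD; the static helper is the stock Lucene 4.0 entry point):

import java.io.File;

import com.fr.third.org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import com.fr.third.org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;

public class HyphenationGrammarDemo {
    public static void main(String[] args) throws Exception {
        // Parses the <classes>, <exceptions> and <patterns> sections into a trie.
        HyphenationTree grammar =
                HyphenationCompoundWordTokenFilter.getHyphenationTree(new File("de.xml"));
        System.out.println("loaded hyphenation grammar: " + grammar);
    }
}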

172
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/cz/stopwords.txt

@@ -0,0 +1,172 @@
a
s
k
o
i
u
v
z
dnes
cz
tímto
budeš
budem
byli
jseš
můj
svým
ta
tomto
tohle
tuto
tyto
jej
zda
proč
máte
tato
kam
tohoto
kdo
kteří
mi
nám
tom
tomuto
mít
nic
proto
kterou
byla
toho
protože
asi
ho
naši
napište
re
což
tím
takže
svých
její
svými
jste
aj
tu
tedy
teto
bylo
kde
ke
pravé
ji
nad
nejsou
či
pod
téma
mezi
přes
ty
pak
vám
ani
když
však
neg
jsem
tento
článku
články
aby
jsme
před
pta
jejich
byl
ještě
bez
také
pouze
první
vaše
která
nás
nový
tipy
pokud
může
strana
jeho
své
jiné
zprávy
nové
není
vás
jen
podle
zde
být
více
bude
již
než
který
by
které
co
nebo
ten
tak
při
od
po
jsou
jak
další
ale
si
se
ve
to
jako
za
zpět
ze
do
pro
je
na
atd
atp
jakmile
přičemž
on
ona
ono
oni
ony
my
vy
ji
mne
jemu
tomu
těm
těmu
němu
němuž
jehož
jíž
jelikož
jež
jakož
načež

78
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/el/stopwords.txt

@@ -0,0 +1,78 @@
# Lucene Greek Stopwords list
# Note: by default this file is used after GreekLowerCaseFilter,
# so when modifying this file use 'σ' instead of 'ς'
ο
η
το
οι
τα
του
τησ
των
τον
την
και
κι
κ
ειμαι
εισαι
ειναι
ειμαστε
ειστε
στο
στον
στη
στην
μα
αλλα
απο
για
προσ
με
σε
ωσ
παρα
αντι
κατα
μετα
θα
να
δε
δεν
μη
μην
επι
ενω
εαν
αν
τοτε
που
πωσ
ποιοσ
ποια
ποιο
ποιοι
ποιεσ
ποιων
ποιουσ
αυτοσ
αυτη
αυτο
αυτοι
αυτων
αυτουσ
αυτεσ
αυτα
εκεινοσ
εκεινη
εκεινο
εκεινοι
εκεινεσ
εκεινα
εκεινων
εκεινουσ
οπωσ
ομωσ
ισωσ
οσο
οτι

99
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/eu/stopwords.txt

@@ -0,0 +1,99 @@
# example set of basque stopwords
al
anitz
arabera
asko
baina
bat
batean
batek
bati
batzuei
batzuek
batzuetan
batzuk
bera
beraiek
berau
berauek
bere
berori
beroriek
beste
bezala
da
dago
dira
ditu
du
dute
edo
egin
ere
eta
eurak
ez
gainera
gu
gutxi
guzti
haiei
haiek
haietan
hainbeste
hala
han
handik
hango
hara
hari
hark
hartan
hau
hauei
hauek
hauetan
hemen
hemendik
hemengo
hi
hona
honek
honela
honetan
honi
hor
hori
horiei
horiek
horietan
horko
horra
horrek
horrela
horretan
horri
hortik
hura
izan
ni
noiz
nola
non
nondik
nongo
nor
nora
ze
zein
zen
zenbait
zenbat
zer
zergatik
ziren
zituen
zu
zuek
zuen
zuten

313
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/fa/stopwords.txt

@@ -0,0 +1,313 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
# Note: by default this file is used after normalization, so when adding entries
# to this file, use the arabic 'ي' instead of 'ی'
انان
نداشته
سراسر
خياه
ايشان
وي
تاكنون
بيشتري
دوم
پس
ناشي
وگو
يا
داشتند
سپس
هنگام
هرگز
پنج
نشان
امسال
ديگر
گروهي
شدند
چطور
ده
و
دو
نخستين
ولي
چرا
چه
وسط
ه
كدام
قابل
يك
رفت
هفت
همچنين
در
هزار
بله
بلي
شايد
اما
شناسي
گرفته
دهد
داشته
دانست
داشتن
خواهيم
ميليارد
وقتيكه
امد
خواهد
جز
اورده
شده
بلكه
خدمات
شدن
برخي
نبود
بسياري
جلوگيري
حق
كردند
نوعي
بعري
نكرده
نظير
نبايد
بوده
بودن
داد
اورد
هست
جايي
شود
دنبال
داده
بايد
سابق
هيچ
همان
انجا
كمتر
كجاست
گردد
كسي
تر
مردم
تان
دادن
بودند
سري
جدا
ندارند
مگر
يكديگر
دارد
دهند
بنابراين
هنگامي
سمت
جا
انچه
خود
دادند
زياد
دارند
اثر
بدون
بهترين
بيشتر
البته
به
براساس
بيرون
كرد
بعضي
گرفت
توي
اي
ميليون
او
جريان
تول
بر
مانند
برابر
باشيم
مدتي
گويند
اكنون
تا
تنها
جديد
چند
بي
نشده
كردن
كردم
گويد
كرده
كنيم
نمي
نزد
روي
قصد
فقط
بالاي
ديگران
اين
ديروز
توسط
سوم
ايم
دانند
سوي
استفاده
شما
كنار
داريم
ساخته
طور
امده
رفته
نخست
بيست
نزديك
طي
كنيد
از
انها
تمامي
داشت
يكي
طريق
اش
چيست
روب
نمايد
گفت
چندين
چيزي
تواند
ام
ايا
با
ان
ايد
ترين
اينكه
ديگري
راه
هايي
بروز
همچنان
پاعين
كس
حدود
مختلف
مقابل
چيز
گيرد
ندارد
ضد
همچون
سازي
شان
مورد
باره
مرسي
خويش
برخوردار
چون
خارج
شش
هنوز
تحت
ضمن
هستيم
گفته
فكر
بسيار
پيش
براي
روزهاي
انكه
نخواهد
بالا
كل
وقتي
كي
چنين
كه
گيري
نيست
است
كجا
كند
نيز
يابد
بندي
حتي
توانند
عقب
خواست
كنند
بين
تمام
همه
ما
باشند
مثل
شد
اري
باشد
اره
طبق
بعد
اگر
صورت
غير
جاي
بيش
ريزي
اند
زيرا
چگونه
بار
لطفا
مي
درباره
من
ديده
همين
گذاري
برداري
علت
گذاشته
هم
فوق
نه
ها
شوند
اباد
همواره
هر
اول
خواهند
چهار
نام
امروز
مان
هاي
قبل
كنم
سعي
تازه
را
هستند
زير
جلوي
عنوان
بود

110
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/ga/stopwords.txt

@@ -0,0 +1,110 @@
a
ach
ag
agus
an
aon
ar
arna
as
b'
ba
beirt
bhúr
caoga
ceathair
ceathrar
chomh
chtó
chuig
chun
cois
céad
cúig
cúigear
d'
daichead
dar
de
deich
deichniúr
den
dhá
do
don
dtí
dár
faoi
faoin
faoina
faoinár
fara
fiche
gach
gan
go
gur
haon
hocht
i
iad
idir
in
ina
ins
inár
is
le
leis
lena
lenár
m'
mar
mo
na
nach
naoi
naonúr
níor
nócha
ocht
ochtar
os
roimh
sa
seacht
seachtar
seachtó
seasca
seisear
siad
sibh
sinn
sna
tar
thar
thú
triúr
trí
trína
trínár
tríocha
um
ár
é
éis
í
ó
ón
óna
ónár

647
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/gl/galician.rslp

@@ -0,0 +1,647 @@
# Steps file for the RSLP stemmer.
# Step 1: Plural Reduction
{ "Plural", 3, 1, {"s"},
# bons -> bon
{"ns",1,"n",{"luns","furatapóns","furatapons"}},
# xamós -> xamón
{"ós",3,"ón"},
# balões -> balón
{"ões",3,"ón"},
# capitães -> capitão
{"ães",1,"ão",{"mães","magalhães"}},
# normais -> normal
{"ais",2,"al",{"cais","tais","mais","pais","ademais"}},
{"áis",2,"al",{"cáis","táis", "máis", "páis", "ademáis"}},
# papéis -> papel
{"éis",2,"el"},
# posíbeis -> posíbel
{"eis",2,"el"},
# espanhóis -> espanhol
{"óis",2,"ol",{"escornabóis"}},
# caracois -> caracol
{"ois",2,"ol",{"escornabois"}},
# cadrís -> cadril
{"ís",2,"il",{"país"}},
# cadris -> cadril
{"is",2,"il",{"menfis","pais","kinguis"}},
# males -> mal
{"les",2,"l",{"ingles","marselles","montreales","senegales","manizales","móstoles","nápoles"}},
# mares -> mar
{"res",3,"r",{"petres","henares","cáceres","baleares","linares","londres","mieres","miraflores","mércores","venres", "pires"}},
# luces -> luz
{"ces",2,"z"},
# luzes -> luz
{"zes",2,"z"},
# leises -> lei
{"ises",3,"z"},
# animás -> animal
{"ás",1,"al",{"más"}},
# gases -> gas
{"ses",2,"s"},
# casas -> casa
{"s",2,"",{"barbadés","barcelonés","cantonés","gabonés","llanés","medinés","escocés","escocês","francês","barcelonês","cantonês","macramés","reves","barcelones","cantones","gabones","llanes","magallanes","medines","escoces","frances","xoves","martes","aliás","pires","lápis","cais","mais","mas","menos","férias","pêsames","crúcis","país","cangas","atenas","asturias","canarias","filipinas","honduras","molucas","caldas","mascareñas","micenas","covarrubias","psoas","óculos","nupcias","xoves","martes","llanes"}}};
{ "Unification", 0, 0, {},
# cansadísimo -> cansadísimo
{"íssimo",5,"ísimo"},
# cansadísima -> cansadísima
{"íssima",5,"ísima"},
# homaço -> homazo
{"aço",4,"azo"},
# mulheraça -> mulheraza
{"aça",4,"aza"},
# xentuça -> xentuza
{"uça",4,"uza"},
# manilhar -> manillar
{"lhar",2,"llar"},
# colher -> coller
{"lher",2,"ller"},
# melhor -> mellor
{"lhor",2,"llor"},
# alho -> allo
{"lho",1,"llo"},
# linhar -> liñar
{"nhar",2,"ñar"},
# penhor -> peñor
{"nhor",2,"ñor"},
# anho -> año
{"nho",1,"ño"},
# cunha -> cuña
{"nha",1,"ña"},
# hospitalário -> hospitalario
{"ário",3,"ario"},
# bibliotecária -> bibliotecaria
{"ária",3,"aria"},
# agradable -> agradábel
{"able",2,"ábel"},
# agradávele -> agradábel
{"ável",2,"ábel"},
# imposible -> imposíbel
{"ible",2,"íbel"},
# imposível -> imposíbel
{"ível",2,"íbel"},
# imposiçom -> imposición
{"çom",2,"ción"},
# garagem -> garaxe
{"agem",2,"axe"},
# garage -> garaxe
{"age",2,"axe"},
# impressão -> impressón
{"ão",3,"ón"},
# irmao -> irmán
{"ao",1,"án"},
# irmau -> irmán
{"au",1,"án"},
# garrafom -> garrafón
{"om",3,"ón"},
# cantem -> canten
{"m",2,"n"}};
{ "Adverb", 0, 0, {},
# felizmente -> feliz
{"mente",4,"",{"experimente","vehemente","sedimente"}}};
{ "Augmentative", 0, 1, {},
# cansadísimo -> cansad
{"dísimo",5},
# cansadísima -> cansad
{"dísima",5},
# amabilísimo -> ama
{"bilísimo",3},
# amabilísima -> ama
{"bilísima",3},
# fortísimo -> fort
{"ísimo",3},
# fortísima -> fort
{"ísima",3},
# centésimo -> cent
{"ésimo",3},
# centésima -> cent
{"ésima",3},
# paupérrimo -> paup
{"érrimo",4},
# paupérrima -> paup
{"érrima",4},
# charlatana -> charlat
{"ana",2,"",{"argana","banana","choupana","espadana","faciana","iguana","lantana","macana","membrana","mesana","nirvana","obsidiana","palangana","pavana","persiana","pestana","porcelana","pseudomembrana","roldana","sábana","salangana","saragana","ventana"}},
# charlatán -> charlat
{"án",3,"",{"ademán","bardán","barregán","corricán","curricán","faisán","furacán","fustán","gabán","gabián","galán","gañán","lavacán","mazán","mourán","rabadán","serán","serrán","tabán","titán","tobogán","verán","volcán","volován"}},
# homazo -> hom
{"azo",4,"",{"abrazo","espazo","andazo","bagazo","balazo","bandazo","cachazo","carazo","denazo","engazo","famazo","lampreazo","pantocazo","pedazo","preñazo","regazo","ribazo","sobrazo","terrazo","trompazo"}},
# mulleraza -> muller
{"aza",3,"",{"alcarraza","ameaza","baraza","broucaza","burgaza","cabaza","cachaza","calaza","carpaza","carraza","coiraza","colmaza","fogaza","famaza","labaza","liñaza","melaza","mordaza","paraza","pinaza","rabaza","rapaza","trancaza"}},
# cascallo -> casc
{"allo",4,"",{"traballo"}},
# xentalla -> xent
{"alla",4},
# bocarra -> boc
{"arra",3,"",{"cigarra","cinzarra"}},
# medicastro -> medic
{"astro",3,"",{"balastro","bimbastro","canastro","retropilastro"}},
# poetastra -> poet
{"astra",3,"",{"banastra","canastra","contrapilastra","piastra","pilastra"}},
# corpázio -> corp
{"ázio",3,"",{"topázio"}},
# soutelo -> sout
{"elo",4,"",{"bacelo","barrelo","bicarelo","biquelo","boquelo","botelo","bouquelo","cacarelo","cachelo","cadrelo","campelo","candelo","cantelo","carabelo","carambelo","caramelo","cercelo","cerebelo","chocarelo","coitelo","conchelo","corbelo","cotobelo","couselo","destelo","desvelo","esfácelo","fandelo","fardelo","farelo","farnelo","flabelo","ganchelo","garfelo","involucelo","mantelo","montelo","outerelo","padicelo","pesadelo","pinguelo","piquelo","rampelo","rastrelo","restelo","tornecelo","trabelo","restrelo","portelo","ourelo","zarapelo"}},
# avioneta -> avion
{"eta",3,"",{"arqueta","atleta","avoceta","baioneta","baldeta","banqueta","barraganeta","barreta","borleta","buceta","caceta","calceta","caldeta","cambeta","canaleta","caneta","carreta","cerceta","chaparreta","chapeta","chareta","chincheta","colcheta","cometa","corbeta","corveta","cuneta","desteta","espeta","espoleta","estafeta","esteta","faceta","falanxeta","frasqueta","gaceta","gabeta","galleta","garabeta","gaveta","glorieta","lagareta","lambeta","lanceta","libreta","maceta","macheta","maleta","malleta","mareta","marreta","meseta","mofeta","muleta","peseta","planeta","raqueta","regreta","saqueta","veleta","vendeta","viñeta"}},
# guapete -> guap
{"ete",3,"",{"alfinete","ariete","bacinete","banquete","barallete","barrete","billete","binguelete","birrete","bonete","bosquete","bufete","burlete","cabalete","cacahuete","cavinete","capacete","carrete","casarete","casete","chupete","clarinete","colchete","colete","capete","curupete","disquete","estilete","falsete","ferrete","filete","gallardete","gobelete","inglete","machete","miquelete","molete","mosquete","piquete","ribete","rodete","rolete","roquete","sorvete","vedete","vendete"}},
# práctica -> práct
{"ica",3,"",{"andarica","botánica","botica","dialéctica","dinámica","física","formica","gráfica","marica","túnica"}},
# práctico -> práct
{"ico",3,"",{"conico","acetifico","acidifico"}},
# trapexo -> trap
{"exo",3,"",{"arpexo","arquexo","asexo","axexo","azulexo","badexo","bafexo","bocexo","bosquexo","boubexo","cacarexo","carrexo","cascarexo","castrexo","convexo","cotexo","desexo","despexo","forcexo","gabexo","gargarexo","gorgolexo","inconexo","manexo","merexo","narnexo","padexo","patexo","sopexo","varexo"}},
{"exa",3,"",{"airexa","bandexa","carrexa","envexa","igrexa","larexa","patexa","presexa","sobexa"}},
# multidão -> mult
{"idão",3},
# pequeniño -> pequeno
{"iño",3,"o",{"camiño","cariño","comiño","golfiño","padriño","sobriño","viciño","veciño"}},
# pequeniña -> pequena
{"iña",3,"a",{"camariña","campiña","entreliña","espiña","fariña","moriña","valiña"}},
# grandito -> grand
{"ito",3,""},
# grandita -> grand
{"ita",3,""},
# anomaloide -> animal
{"oide",3,"",{"anaroide","aneroide","asteroide","axoide","cardioide","celuloide","coronoide","discoide","espermatozoide","espiroide","esquizoide","esteroide","glenoide","linfoide","hemorroide","melaloide","sacaroide","tetraploide","varioloide"}},
# cazola -> caz
{"ola",3,"",{"aixola","ampola","argola","arola","arteríola","bandola","bítola","bractéola","cachola","carambola","carapola","carola","carrandiola","catrapola","cebola","centola","champola","chatola","cirola","cítola","consola","corola","empola","escarola","esmola","estola","fitola","florícola","garañola","gárgola","garxola","glicocola","góndola","mariola","marola","michola","pirola","rebola","rupícola","saxícola","sémola","tachola","tómbola"}},
# pedrolo -> pedr
{"olo",3,"",{"arrolo","babiolo","cacharolo","caixarolo","carolo","carramolo","cascarolo","cirolo","codrolo","correolo","cotrolo","desconsolo","rebolo","repolo","subsolo","tixolo","tómbolo","torolo","trémolo","vacúolo","xermolo","zócolo"}},
# vellote -> vell
{"ote",3,"",{"aigote","alcaiote","barbarote","balote","billote","cachote","camarote","capote","cebote","chichote","citote","cocorote","escote","gañote","garrote","gavote","lamote","lapote","larapote","lingote","lítote","magote","marrote","matalote","pandote","paparote","rebote","tagarote","zarrote"}},
# mozota -> moz
{"ota",3,"",{"asíntota","caiota","cambota","chacota","compota","creosota","curota","derrota","díspota","gamota","maniota","pelota","picota","pillota","pixota","queirota","remota"}},
# gordocho -> gord
{"cho",3,"",{"abrocho","arrocho","carocho","falucho","bombacho","borracho","mostacho"}},
# gordecha -> gord
{"cha",3,"",{"borracha","carracha","estacha","garnacha","limacha","remolacha","abrocha"}},
# baratuco -> barat
{"uco",4,"",{"caduco","estuco","fachuco","malluco","saluco","trabuco"}},
# borrachuzo -> borrach
{"uzo",3,"",{"carriñouzo","fachuzo","mañuzo","mestruzo","tapuzo"}},
# xentuza -> xent
{"uza",3,"",{"barruza","chamuza","chapuza","charamuza","conduza","deduza","desluza","entreluza","induza","reluza","seduza","traduza","trasluza"}},
# babuxa -> bab
{"uxa",3,"",{"caramuxa","carrabouxa","cartuxa","coruxa","curuxa","gaturuxa","maruxa","meruxa","miruxa","moruxa","muruxa","papuxa","rabuxa","trouxa"}},
{"uxo",3,"",{"caramuxo","carouxo","carrabouxo","curuxo","debuxo","ganduxo","influxo","negouxo","pertuxo","refluxo"}},
# grupello -> grup
{"ello",3,"",{"alborello","artello","botello","cachafello","calello","casarello","cazabello","cercello","cocerello","concello","consello","desparello","escaravello","espello","fedello","fervello","gagafello","gorrobello","nortello","pendello","troupello","trebello"}},
# pontella -> pont
{"ella",3,"",{"alborella","bertorella","bocatella","botella","calella","cercella","gadella","grosella","lentella","movella","nocella","noitevella","parella","pelella","percebella","segorella","sabella"}}};
{ "Noun", 0, 0, {},
# lealdade -> leal
{"dade",3,"",{"acridade","calidade"}},
# clarificar -> clar
{"ificar",2},
# brasileiro->brasil
{"eiro",3,"",{"agoireiro","bardalleiro","braseiro","barreiro","canteiro","capoeiro","carneiro","carteiro","cinceiro","faroleiro","mareiro","preguiceiro","quinteiro","raposeiro","retranqueiro","regueiro","sineiro","troleiro","ventureiro"}},
# marisqueira -> marisqu
{"eira",3,"",{"cabeleira","canteira","cocheira","folleira","milleira"}},
# hospitalario -> hospital
{"ario",3,"",{"armario","calcario","lionario","salario"}},
# bibliotecaria -> bibliotec
{"aria",3,"",{"cetaria","coronaria","fumaria","linaria","lunaria","parietaria","saponaria","serpentaria"}},
# humorístico -> humor
{"ístico",3,"",{"balístico", "ensaístico"}},
# castrista -> castr
{"ista",3,"",{"batista","ciclista","fadista","operista","tenista","verista"}},
# lavado -> lav
{"ado",2,"",{"grado","agrado"}},
# decanato -> decan
{"ato",2,"",{"agnato"}},
# xemido -> xem
{"ido",3,"",{"cándido","cândido","consolido","decidido","duvido","marido","rápido"}},
# mantida -> mant
{"ida",3,"",{"bastida","dúbida","dubida","duvida","ermida","éxida","guarida","lapicida","medida","morida"}},
{"ída",3},
# mantído -> mant
{"ido",3},
# orelludo -> orell
{"udo",3,"",{"estudo","escudo"}},
# orelluda -> orell
{"uda",3},
{"ada",3,"",{"abada","alhada","allada","pitada"}},
# comedela -> come
{"dela",3,"",{"cambadela","cavadela","forcadela","erisipidela","mortadela","espadela","fondedela","picadela","arandela","candela","cordela","escudela","pardela"}},
# fontela -> font
{"ela",3,"",{"canela","capela","cotela","cubela","curupela","escarapela","esparrela","estela","fardela","flanela","fornela","franela","gabela","gamela","gavela","glumela","granicela","lamela","lapela","malvela","manela","manganela","mexarela","micela","mistela","novela","ourela","panela","parcela","pasarela","patamela","patela","paxarela","pipela","pitela","postela","pubela","restela","sabela","salmonela","secuela","sentinela","soldanela","subela","temoncela","tesela","tixela","tramela","trapela","varela","vitela","xanela","xestela"}},
# agradábel -> agrad
{"ábel",2,"",{"afábel","fiábel"}},
# combustíbel -> combust
{"íbel",2,"",{"críbel","imposíbel","posíbel","fisíbel","falíbel"}},
# fabricante -> frabrica
{"nte",3,"",{"alimente","adiante","acrescente","elefante","frequente","freqüente","gigante","instante","oriente","permanente","posante","possante","restaurante"}},
# ignorancia -> ignora
{"ncia",3},
# temperanza -> tempera
{"nza",3},
{"acia",3,"",{"acracia","audacia","falacia","farmacia"}},
# inmundicia -> inmund
{"icia",3,"",{"caricia","delicia","ledicia","malicia","milicia","noticia","pericia","presbicia","primicia","regalicia","sevicia","tiricia"}},
# xustiza -> xust
{"iza",3,"",{"alvariza","baliza","cachiza","caniza","cañiza","carbaliza","carriza","chamariza","chapiza","fraguiza","latiza","longaniza","mañiza","nabiza","peliza","preguiza","rabiza"}},
# clarexar -> clar
{"exar",3,"",{"palmexar"}},
# administración -> administr
{"ación",2,"",{"aeración"}},
# expedición -> exped
{"ición",3,"",{"condición","gornición","monición","nutrición","petición","posición","sedición","volición"}},
# excepción -> except
{"ción",3,"t"},
# comprensión -> comprens
{"sión",3,"s",{"abrasión", "alusión"}},
# doazón -> do
{"azón",2,"",{"armazón"}},
# garrafón -> garraf
{"ón",3,"",{"abalón","acordeón","alción","aldrabón","alerón","aliñón","ambón","bombón","calzón","campón","canalón","cantón","capitón","cañón","centón","ciclón","collón","colofón","copón","cotón","cupón","petón","tirón","tourón","turón","unción","versión","zubón","zurrón"}},
# lambona -> lamb
{"ona",3,"",{"abandona","acetona","aleurona","amazona","anémona","bombona","cambona","carona","chacona","charamona","cincona","condona","cortisona","cretona","cretona","detona","estona","fitohormona","fregona","gerona","hidroquinona","hormona","lesiona","madona","maratona","matrona","metadona","monótona","neurona","pamplona","peptona","poltrona","proxesterona","quinona","quinona","silicona","sulfona"}},
# bretoa -> bretón
{"oa",3,"",{"abandoa","madroa","barbacoa","estoa","airoa","eiroa","amalloa","ámboa","améndoa","anchoa","antinéboa","avéntoa","avoa","bágoa","balboa","bisavoa","boroa","canoa","caroa","comadroa","coroa","éngoa","espácoa","filloa","fírgoa","grañoa","lagoa","lanzoa","magoa","mámoa","morzoa","noiteboa","noraboa","parañoa","persoa","queiroa","rañoa","táboa","tataravoa","teiroa"}},
# demoníaco -> demoní
{"aco",3},
# demoníaca -> demoní
{"aca",3,"",{"alpaca","barraca","bullaca","buraca","carraca","casaca","cavaca","cloaca","entresaca","ervellaca","espinaca","estaca","farraca","millaca","pastinaca","pataca","resaca","urraca","purraca"}},
# carballal -> carball
{"al",4,"",{"afinal","animal","estatal","bisexual","bissexual","desleal","fiscal","formal","pessoal","persoal","liberal","postal","virtual","visual","pontual","puntual","homosexual","heterosexual"}},
# nadador -> nada
{"dor",2,"",{"abaixador"}},
# benfeitor -> benfei
{"tor",3,"",{"autor","motor","pastor","pintor"}},
# produtor -> produt
{"or",2,"",{"asesor","assessor","favor","mellor","melhor","redor","rigor","sensor","tambor","tumor"}},
# profesora -> profes
{"ora",3,"",{"albacora","anáfora","áncora","apisoadora","ardora","ascospora","aurora","avéspora","bitácora","canéfora","cantimplora","catáfora","cepilladora","demora","descalcificadora","diáspora","empacadora","epífora","ecavadora","escora","eslora","espora","fotocompoñedora","fotocopiadora","grampadora","isícora","lavadora","lixadora","macrospora","madrépora","madrágora","masora","mellora","metáfora","microspora","milépora","milpéndora","nécora","oospora","padeadora","pasiflora","pécora","píldora","pólvora","ratinadora","rémora","retroescavadora","sófora","torradora","trémbora","uredospora","víbora","víncora","zoospora"}},
# zapataría -> zapat
{"aría",3,"",{"libraría"}},
# etiquetaxe -> etiquet
{"axe",3,"",{"aluaxe","amaraxe","amperaxe","bagaxe","balaxe","barcaxe","borraxe","bescaxe","cabotaxe","carraxe","cartilaxe","chantaxe","colaxe","coraxe","carruaxe","dragaxe","embalaxe","ensilaxe","epistaxe","fagundaxe","fichaxe","fogaxe","forraxe","fretaxe","friaxe","garaxe","homenaxe","leitaxe","liñaxe","listaxe","maraxe","marcaxe","maridaxe","masaxe","miraxe","montaxe","pasaxe","peaxe","portaxe","ramaxe","rebelaxe","rodaxe","romaxe","sintaxe","sondaxe","tiraxe","vantaxe","vendaxe","viraxe"}},
# movedizo -> move
{"dizo",3},
# limpeza -> limp
{"eza",3,"",{"alteza","beleza","fereza","fineza","vasteza","vileza"}},
# rixidez -> rixid
{"ez",3,"",{"acidez","adultez","adustez","avidez","candidez","mudez","nenez","nudez","pomez"}},
# mullerengo -> muller
{"engo",3},
# chairego -> chair
{"ego",3,"",{"corego","derrego","entrego","lamego","sarego","sartego"}},
# cariñoso -> cariñ
{"oso",3,"",{"afanoso","algoso","caldoso","caloso","cocoso","ditoso","favoso","fogoso","lamoso","mecoso","mocoso","precioso","rixoso","venoso","viroso","xesoso"}},
# cariñosa -> cariñ
{"osa",3,"",{"mucosa","glicosa","baldosa","celulosa","isoglosa","nitrocelulosa","levulosa","ortosa","pectosa","preciosa","sacarosa","serosa","ventosa"}},
# negrume -> negr
{"ume",3,"",{"agrume","albume","alcume","batume","cacume","cerrume","chorume","churume","costume","curtume","estrume","gafume","legume","perfume","queixume","zarrume"}},
# altura -> alt
{"ura",3,"",{"albura","armadura","imatura","costura"}},
# cuspiñar -> cusp
{"iñar",3},
# febril -> febr
{"il",3,"",{"abril","alfil","anil","atril","badil","baril","barril","brasil","cadril","candil","cantil","carril","chamil","chancil","civil","cubil","dátil","difícil","dócil","edil","estéril","fácil","fráxil","funil","fusil","grácil","gradil","hábil","hostil","marfil"}},
# principesco -> princip
{"esco",4},
# mourisco -> mour
{"isco",4},
# esportivo -> esport
{"ivo",3,"",{"pasivo","positivo","passivo","possessivo","posesivo","pexotarivo","relativo"}}};
{ "Verb", 0, 0, {},
# amaba -> am
{"aba",2},
# andabade -> and
{"abade",2},
# andábade -> and
{"ábade",2},
# chorabamo -> chor
{"abamo",2},
# chorábamo -> chor
{"ábamo",2},
# moraban -> morab
{"aban",2},
# andache -> and
{"ache",2},
# andade -> and
{"ade",2},
{"an",2},
# cantando -> cant
{"ando",2},
# cantar -> cant
{"ar",2,"",{"azar","bazar","patamar"}},
# lembrarade -> lembra
{"arade",2},
{"aramo",2},
{"arán",2},
# cantaran -> cant
{"aran",2},
# convidárade -> convid
{"árade",2},
# convidaría -> convid
{"aría",2},
# cantariade -> cant
{"ariade",2},
# cantaríade -> cant
{"aríade",2},
# cantarian -> cant
{"arian",2},
# cantariamo -> cant
{"ariamo",2},
# pescaron -> pesc
{"aron",2},
# cantase -> cant
{"ase",2},
# cantasede -> cant
{"asede",2},
# cantásede -> cant
{"ásede",2},
# cantasemo -> cant
{"asemo",2},
# cantásemo -> cant
{"ásemo",2},
# cantasen -> cant
{"asen",2},
# loitavan -> loitav
{"avan",2},
# cantaríamo -> cant
{"aríamo",2},
# cantassen -> cant
{"assen",2},
# cantássemo -> cant
{"ássemo",2},
# beberíamo -> beb
{"eríamo",2},
# bebêssemo -> beb
{"êssemo",2},
# partiríamo -> part
{"iríamo",3},
# partíssemo -> part
{"íssemo",3},
# cantáramo -> cant
{"áramo",2},
# cantárei -> cant
{"árei",2},
# cantaren -> cant
{"aren",2},
# cantaremo -> cant
{"aremo",2},
# cantaríei -> cant
{"aríei",2},
{"ássei",2},
# cantávamo-> cant
{"ávamo",2},
# bebêramo -> beb
{"êramo",1},
# beberemo -> beb
{"eremo",1},
# beberíei -> beb
{"eríei",1},
# bebêssei -> beb
{"êssei",1},
# partiríamo -> part
{"íramo",3},
# partiremo -> part
{"iremo",3},
# partiríei -> part
{"iríei",3},
# partíssei -> part
{"íssei",3},
# partissen -> part
{"issen",3},
# bebendo -> beb
{"endo",1},
# partindo -> part
{"indo",3},
# propondo -> prop
{"ondo",3},
# cantarde -> cant
{"arde",2},
# cantarei -> cant
{"arei",2},
# cantaria -> cant
{"aria",2},
# cantarmo -> cant
{"armo",2},
# cantasse -> cant
{"asse",2},
{"aste",2},
# cantávei -> cant
{"ávei",2},
# perderão -> perd
{"erão",1},
# beberde -> beb
{"erde",1},
# beberei -> beb
{"erei",1},
# bebêrei -> beb
{"êrei",1},
# beberen -> beb
{"eren",2},
# beberia -> beb
{"eria",1},
# bebermo -> beb
{"ermo",1},
# bebeste -> beb
{"este",1,"",{"faroeste","agreste"}},
# bebíamo -> beb
{"íamo",1},
# fuxian -> fux
{"ian",2,"",{"enfian","eloxian","ensaian"}},
# partirde -> part
{"irde",2},
# partírei -> part
{"irei",3,"",{"admirei"}},
# partiren -> part
{"iren",3},
# partiria -> part
{"iria",3},
# partirmo -> part
{"irmo",3},
# partisse -> part
{"isse",3},
# partiste -> part
{"iste",4},
{"iava",1,"",{"ampliava"}},
# cantamo -> cant
{"amo",2},
# funciona -> func
{"iona",3},
# cantara -> cant
{"ara",2,"",{"arara","prepara"}},
# enviará -> envi
{"ará",2,"",{"alvará","bacará"}},
# cantare -> cant
{"are",2,"",{"prepare"}},
# cantava -> cant
{"ava",2,"",{"agrava"}},
# cantemo -> cant
{"emo",2},
# bebera -> beb
{"era",1,"",{"acelera","espera"}},
# beberá -> beb
{"erá",1},
# bebere -> beb
{"ere",1,"",{"espere"}},
# bebíei -> beb
{"íei",1},
# metin -> met
{"in",3},
# partimo -> part
{"imo",3,"",{"reprimo","intimo","íntimo","nimo","queimo","ximo"}},
# partira -> part
{"ira",3,"",{"fronteira","sátira"}},
{"ído",3},
# partirá -> part
{"irá",3},
# concretizar -> concret
{"tizar",4,"",{"alfabetizar"}},
{"izar",3,"",{"organizar"}},
# saltitar -> salt
{"itar",5,"",{"acreditar","explicitar","estreitar"}},
# partire -> part
{"ire",3,"",{"adquire"}},
# compomo -> comp
{"omo",3},
{"ai",2},
# barbear -> barb
{"ear",4,"",{"alardear","nuclear"}},
# cheguei -> cheg
{"uei",3},
{"uía",5,"u"},
# cantei -> cant
{"ei",3},
# beber -> beb
{"er",1,"",{"éter","pier"}},
# bebeu -> beb
{"eu",1,"",{"chapeu"}},
# bebia -> beb
{"ia",1,"",{"estória","fatia","acia","praia","elogia","mania","lábia","aprecia","polícia","arredia","cheia","ásia"}},
# partir -> part
{"ir",3},
# partiu -> part
{"iu",3},
# fraqueou -> fraqu
{"eou",5},
# chegou -> cheg
{"ou",3},
# bebi -> beb
{"i",1},
# varrede -> varr
{"ede",1,"",{"rede","bípede","céspede","parede","palmípede","vostede","hóspede","adrede"}},
# cantei -> cant
{"ei",3},
# anden -> and
{"en",2},
# descerade -> desc
{"erade",1},
# vivérade -> viv
{"érade",1},
# beberan -> beb
{"eran",2},
# colleramo -> coll
{"eramo",1},
# bebéramo -> beb
{"éramo",1},
# perderán -> perd
{"erán",1},
# varrería -> varr
{"ería",1},
# beberiade -> beb
{"eriade",1},
# beberíade -> beb
{"eríade",1},
# beberiamo -> beb
{"eriamo",1},
# beberian -> beb
{"erian",1},
# beberían -> beb
{"erían",1},
# perderon -> perd
{"eron",1},
# bebese -> beb
{"ese",1},
# bebesedes -> beb
{"esedes",1},
# bebésedes -> beb
{"ésedes",1},
# bebesemo -> beb
{"esemo",1},
# bebésemo -> beb
{"ésemo",1},
# bebesen -> beb
{"esen",1},
# bebêssede -> beb
{"êssede",1},
# chovía -> chov
{"ía",1},
# faciade -> fac
{"iade",1},
# facíade -> fac
{"íade",1},
# perdiamo -> perd
{"iamo",1},
# fuxían -> fux
{"ían",1},
# corriche -> corr
{"iche",1},
# partide -> part
{"ide",1},
# escribirade -> escrib
{"irade",3},
# parírade -> par
{"írade",3},
# partiramo -> part
{"iramo",3},
# fugirán -> fug
{"irán",3},
# viviría -> viv
{"iría",3},
# partiriade -> part
{"iriade",3},
# partiríade -> part
{"iríade",3},
# partiriamo -> part
{"iriamo",3},
# partirian -> part
{"irian",3},
# partirían -> part
{"irían",3},
# reflectiron -> reflect
{"iron",3},
# partise -> part
{"ise",3},
# partisede -> part
{"isede",3},
# partísede -> part
{"ísede",3},
# partisemo -> part
{"isemo",3},
# partísemo -> part
{"ísemo",3},
# partisen -> part
{"isen",3},
# partíssede -> part
{"íssede",3},
{"tizar",3,"",{"alfabetizar"}},
{"ondo",3}};
{ "Vowel", 0, 0, {},
# segue -> seg
{"gue",2,"g",{"azougue","dengue","merengue","nurague","merengue","rengue"}},
{"que",2,"c",{"alambique","albaricoque","abaroque","alcrique","almadraque","almanaque","arenque","arinque","baduloque","ballestrinque","betoque","bivaque","bloque","bodaque","bosque","breque","buque","cacique","cheque","claque","contradique","coque","croque","dique","duque","enroque","espeque","estoque","estoraque","estraloque","estrinque","milicroque","monicreque","orinque","arinque","palenque","parque","penique","picabeque","pique","psique","raque","remolque","xeque","repenique","roque","sotobosque","tabique","tanque","toque","traque","truque","vivaque","xaque"}},
{"a",3,"",{"amasadela","cerva"}},
{"e",3,"",{"marte"}},
{"o",3,"",{"barro","fado","cabo","libro","cervo"}},
{"â",3},
{"ã",3,"",{"amanhã","arapuã","fã","divã","manhã"}},
{"ê",3},
{"ô",3},
{"á",3},
{"é",3},
{"ó",3},
# munxi -> munx
{"i",3}};

161
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/gl/stopwords.txt

@ -0,0 +1,161 @@
# galician stopwords
a
aínda
alí
aquel
aquela
aquelas
aqueles
aquilo
aquí
ao
aos
as
así
á
ben
cando
che
co
coa
comigo
con
connosco
contigo
convosco
coas
cos
cun
cuns
cunha
cunhas
da
dalgunha
dalgunhas
dalgún
dalgúns
das
de
del
dela
delas
deles
desde
deste
do
dos
dun
duns
dunha
dunhas
e
el
ela
elas
eles
en
era
eran
esa
esas
ese
eses
esta
estar
estaba
está
están
este
estes
estiven
estou
eu
é
facer
foi
foron
fun
había
hai
iso
isto
la
las
lle
lles
lo
los
mais
me
meu
meus
min
miña
miñas
moi
na
nas
neste
nin
no
non
nos
nosa
nosas
noso
nosos
nós
nun
nunha
nuns
nunhas
o
os
ou
ó
ós
para
pero
pode
pois
pola
polas
polo
polos
por
que
se
senón
ser
seu
seus
sexa
sido
sobre
súa
súas
tamén
tan
te
ten
teñen
teño
ter
teu
teus
ti
tido
tiña
tiven
túa
túas
un
unha
unhas
uns
vos
vosa
vosas
voso
vosos
vós
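The stopword list above uses '#' comment lines with one word per line. A sketch of loading such a file into a CharArraySet, assuming the Lucene 4.0 WordlistLoader API is carried over unchanged under the com.fr.third package prefix:

// Sketch: load a '#'-commented stopword file (package names assume the
// com.fr.third repackaging mirrors Lucene 4.0's layout).
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.analysis.util.WordlistLoader;
import com.fr.third.org.apache.lucene.util.Version;

public class LoadGalicianStopwords {
    public static void main(String[] args) throws Exception {
        Reader reader = new InputStreamReader(
            LoadGalicianStopwords.class.getResourceAsStream(
                "/com/fr/third/org/apache/lucene/analysis/gl/stopwords.txt"),
            StandardCharsets.UTF_8);
        // Lines starting with '#' are comments; every other line is one stopword.
        CharArraySet stopwords = WordlistLoader.getWordSet(reader, "#", Version.LUCENE_40);
        System.out.println(stopwords.contains("aínda")); // true
    }
}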

235
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/hi/stopwords.txt

@ -0,0 +1,235 @@
# Also see http://www.opensource.org/licenses/bsd-license.html
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# This file was created by Jacques Savoy and is distributed under the BSD license.
# Note: by default this file also contains forms normalized by HindiNormalizer
# for spelling variation (see section below), such that it can be used whether or
# not you enable that feature. When adding additional entries to this list,
# please add the normalized form as well.
दर
अत
अपन
अपन
अपन
अभ
आदि
आप
इति
इन
इनक
इन
इन
इन
इस
इसक
इसक
इसक
इसम
इस
इस
उन
उनक
उनक
उनक
उनक
उन
उन
उन
उस
उसक
उस
उस
एक
एव
एस
ऐस
और
कई
कर
करत
करत
करन
करन
कर
कहत
कह
ि
ितन
ि
ि
ि
ि
ि
ि
ि
नस
गय
घर
जब
जह
ितन
ि
ि
ि
ि
ि
धर
तक
तब
तरह
ि
ि
ि
ि
ि
दब
ि
सर
सर
नह
ियत
पर
पर
पहल
ि
बन
बह
बह
िलक
तर
मगर
यदि
यह
यह
यह
ि
रख
रह
रह
ि
ि
ि
वर
वह
वह
वह
वह
वग़रह
सकत
सकत
सबस
सभ
# additional normalized forms of the above
अपनि
ि
सभि
ि
दव
इसि
ि
ि
ओर
ि
वहि
अभि
बनि
ि
ि
वगरह
एस
रव
ि
ि
उसि
ितर
बहि
वह
यह
ि
ि
िि
कइ
यहि
ि
िधर
अदि
इतयि
नस
इसकि
सर
जह
अप
ि
उनकि
ि
वरग
नहि
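As the header of this file notes, the list deliberately mixes raw and HindiNormalizer-normalized spellings so that the same file works whether or not normalization is enabled: when a normalization filter runs before the stop filter, tokens reach the stop set already normalized, so the normalized forms must be in the set too. A minimal sketch of that ordering, assuming the repackaged classes keep Lucene 4.0's names:

// Sketch: normalization before stopping is why normalized forms are listed.
import java.io.Reader;

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.core.StopFilter;
import com.fr.third.org.apache.lucene.analysis.hi.HindiNormalizationFilter;
import com.fr.third.org.apache.lucene.analysis.in.IndicNormalizationFilter;
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

final class HindiChainSketch {
    static TokenStream chain(Reader input, CharArraySet stopwords) {
        TokenStream ts = new StandardTokenizer(Version.LUCENE_40, input);
        ts = new IndicNormalizationFilter(ts);  // script-level normalization
        ts = new HindiNormalizationFilter(ts);  // collapses spelling variants
        // StopFilter now sees normalized tokens, so the stop set must
        // contain the normalized spellings listed in the section above.
        return new StopFilter(Version.LUCENE_40, ts, stopwords);
    }
}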

46
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/hy/stopwords.txt

@ -0,0 +1,46 @@
# example set of Armenian stopwords.
այդ
այլ
այն
այս
դու
դուք
եմ
են
ենք
ես
եք
է
էի
էին
էինք
էիր
էիք
էր
ըստ
թ
ի
ին
իսկ
իր
կամ
համար
հետ
հետո
մենք
մեջ
մի
ն
նա
նաև
նրա
նրանք
որ
որը
որոնք
որպես
ու
ում
պիտի
վրա
և

359
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/id/stopwords.txt

@ -0,0 +1,359 @@
# from appendix D of: A Study of Stemming Effects on Information
# Retrieval in Bahasa Indonesia
ada
adanya
adalah
adapun
agak
agaknya
agar
akan
akankah
akhirnya
aku
akulah
amat
amatlah
anda
andalah
antar
diantaranya
antara
antaranya
diantara
apa
apaan
mengapa
apabila
apakah
apalagi
apatah
atau
ataukah
ataupun
bagai
bagaikan
sebagai
sebagainya
bagaimana
bagaimanapun
sebagaimana
bagaimanakah
bagi
bahkan
bahwa
bahwasanya
sebaliknya
banyak
sebanyak
beberapa
seberapa
begini
beginian
beginikah
beginilah
sebegini
begitu
begitukah
begitulah
begitupun
sebegitu
belum
belumlah
sebelum
sebelumnya
sebenarnya
berapa
berapakah
berapalah
berapapun
betulkah
sebetulnya
biasa
biasanya
bila
bilakah
bisa
bisakah
sebisanya
boleh
bolehkah
bolehlah
buat
bukan
bukankah
bukanlah
bukannya
cuma
percuma
dahulu
dalam
dan
dapat
dari
daripada
dekat
demi
demikian
demikianlah
sedemikian
dengan
depan
di
dia
dialah
dini
diri
dirinya
terdiri
dong
dulu
enggak
enggaknya
entah
entahlah
terhadap
terhadapnya
hal
hampir
hanya
hanyalah
harus
haruslah
harusnya
seharusnya
hendak
hendaklah
hendaknya
hingga
sehingga
ia
ialah
ibarat
ingin
inginkah
inginkan
ini
inikah
inilah
itu
itukah
itulah
jangan
jangankan
janganlah
jika
jikalau
juga
justru
kala
kalau
kalaulah
kalaupun
kalian
kami
kamilah
kamu
kamulah
kan
kapan
kapankah
kapanpun
dikarenakan
karena
karenanya
ke
kecil
kemudian
kenapa
kepada
kepadanya
ketika
seketika
khususnya
kini
kinilah
kiranya
sekiranya
kita
kitalah
kok
lagi
lagian
selagi
lah
lain
lainnya
melainkan
selaku
lalu
melalui
terlalu
lama
lamanya
selama
selamanya
lebih
terlebih
bermacam
macam
semacam
maka
makanya
makin
malah
malahan
mampu
mampukah
mana
manakala
manalagi
masih
masihkah
semasih
masing
mau
maupun
semaunya
memang
mereka
merekalah
meski
meskipun
semula
mungkin
mungkinkah
nah
namun
nanti
nantinya
nyaris
oleh
olehnya
seorang
seseorang
pada
padanya
padahal
paling
sepanjang
pantas
sepantasnya
sepantasnyalah
para
pasti
pastilah
per
pernah
pula
pun
merupakan
rupanya
serupa
saat
saatnya
sesaat
saja
sajalah
saling
bersama
sama
sesama
sambil
sampai
sana
sangat
sangatlah
saya
sayalah
se
sebab
sebabnya
sebuah
tersebut
tersebutlah
sedang
sedangkan
sedikit
sedikitnya
segala
segalanya
segera
sesegera
sejak
sejenak
sekali
sekalian
sekalipun
sesekali
sekaligus
sekarang
sekitar
sekitarnya
sela
selain
selalu
seluruh
seluruhnya
semakin
sementara
sempat
semua
semuanya
sendiri
sendirinya
seolah
seperti
sepertinya
sering
seringnya
serta
siapa
siapakah
siapapun
disini
disinilah
sini
sinilah
sesuatu
sesuatunya
suatu
sesudah
sesudahnya
sudah
sudahkah
sudahlah
supaya
tadi
tadinya
tak
tanpa
setelah
telah
tentang
tentu
tentulah
tentunya
tertentu
seterusnya
tapi
tetapi
setiap
tiap
setidaknya
tidak
tidakkah
tidaklah
toh
waduh
wah
wahai
sewaktu
walau
walaupun
wong
yaitu
yakni
yang

172
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/lv/stopwords.txt

@ -0,0 +1,172 @@
# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins
# the original list of over 800 forms was refined:
# pronouns, adverbs, interjections were removed
#
# prepositions
aiz
ap
ar
apakš
ārpus
augšpus
bez
caur
dēļ
gar
iekš
iz
kopš
labad
lejpus
līdz
no
otrpus
pa
par
pār
pēc
pie
pirms
pret
priekš
starp
šaipus
uz
viņpus
virs
virspus
zem
apakšpus
# Conjunctions
un
bet
jo
ja
ka
lai
tomēr
tikko
turpretī
arī
kaut
gan
tādēļ
ne
tikvien
vien
ir
te
vai
kamēr
# Particles
ar
diezin
droši
diemžēl
nebūt
ik
it
taču
nu
pat
tiklab
iekšpus
nedz
tik
nevis
turpretim
jeb
iekam
iekām
iekāms
kolīdz
līdzko
tiklīdz
jebšu
tālab
tāpēc
nekā
itin
jau
jel
nezin
tad
tikai
vis
tak
iekams
vien
# modal verbs
būt
biju
biji
bija
bijām
bijāt
esmu
esi
esam
esat
būšu
būsi
būs
būsim
būsiet
tikt
tiku
tiki
tika
tikām
tikāt
tieku
tiec
tiek
tiekam
tiekat
tikšu
tiks
tiksim
tiksiet
tapt
tapi
tapāt
topat
tapšu
tapsi
taps
tapsim
tapsiet
kļūt
kļuvu
kļuvi
kļuva
kļuvām
kļuvāt
kļūstu
kļūsti
kļūst
kļūstam
kļūstat
kļūšu
kļūsi
kļūs
kļūsim
kļūsiet
# verbs
varēt
varēju
varējām
varēšu
varēsim
var
varēji
varējāt
varēsi
varēsiet
varat
varēja
varēs

456
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/pt/portuguese.rslp

@ -0,0 +1,456 @@
# Steps file for the RSLP stemmer.
# Step 1: Plural Reduction
{ "Plural", 3, 1, {"s"},
# bons -> bom
{"ns",1,"m"},
# balões -> balão
{"ões",3,"ão"},
# capitães -> capitão
{"ães",1,"ão",{"mães"}},
# normais -> normal
{"ais",1,"al",{"cais","mais"}},
# papéis -> papel
{"éis",2,"el"},
# amáveis -> amável
{"eis",2,"el"},
# lençóis -> lençol
{"óis",2,"ol"},
# barris -> barril
{"is",2,"il",{"lápis","cais","mais","crúcis","biquínis","pois","depois","dois","leis"}},
# males -> mal
{"les",3,"l"},
# mares -> mar
{"res",3,"r", {"árvores"}},
# casas -> casa
{"s",2,"",{"aliás","pires","lápis","cais","mais","mas","menos","férias","fezes","pêsames","crúcis","gás","atrás","moisés","através","convés","ês","país","após","ambas","ambos","messias", "depois"}}};
# Step 2: Adverb Reduction
{ "Adverb", 0, 0, {},
# felizmente -> feliz
{"mente",4,"",{"experimente"}}};
# Step 3: Feminine Reduction
{ "Feminine", 3, 1, {"a","ã"},
# chefona -> chefão
{"ona",3,"ão",{"abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","carona"}},
# vilã -> vilão
{"ã",2,"ão",{"amanhã","arapuã","fã","divã"}},
# professora -> professor
{"ora",3,"or"},
# americana -> americano
{"na",4,"no",{"carona","abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","guiana","campana","grana","caravana","banana","paisana"}},
# sozinha -> sozinho
{"inha",3,"inho",{"rainha","linha","minha"}},
# inglesa -> inglês
{"esa",3,"ês",{"mesa","obesa","princesa","turquesa","ilesa","pesa","presa"}},
# famosa -> famoso
{"osa",3,"oso",{"mucosa","prosa"}},
# maníaca -> maníaco
{"íaca",3,"íaco"},
# prática -> prático
{"ica",3,"ico",{"dica"}},
# cansada -> cansado
{"ada",2,"ado",{"pitada"}},
# mantida -> mantido
{"ida",3,"ido",{"vida","dúvida"}},
{"ída",3,"ido",{"recaída","saída"}},
# prima -> primo
{"ima",3,"imo",{"vítima"}},
# passiva -> passivo
{"iva",3,"ivo",{"saliva","oliva"}},
# primeira -> primeiro
{"eira",3,"eiro",{"beira","cadeira","frigideira","bandeira","feira","capoeira","barreira","fronteira","besteira","poeira"}}};
# Step 4: Augmentative/Diminutive Reduction
{ "Augmentative", 0, 1, {},
# cansadíssimo -> cansa
{"díssimo",5},
# amabilíssimo -> ama
{"abilíssimo",5},
# fortíssimo -> fort
{"íssimo",3},
{"ésimo",3},
# chiquérrimo -> chiqu
{"érrimo",4},
# pezinho -> pe
{"zinho",2},
# maluquinho -> maluc
{"quinho",4,"c"},
# amiguinho -> amig
{"uinho",4},
# cansadinho -> cansad
{"adinho",3},
# carrinho -> carr
{"inho",3,"",{"caminho","cominho"}},
# grandalhão -> grand
{"alhão",4},
# dentuça -> dent
{"uça",4},
# ricaço -> ric
{"aço",4,"",{"antebraço"}},
{"aça",4},
# cansadão -> cans
{"adão",4},
{"idão",4},
# corpázio -> corp
{"ázio",3,"",{"topázio"}},
# pratarraz -> prat
{"arraz",4},
{"zarrão",3},
{"arrão",4},
# bocarra -> boc
{"arra",3},
# calorzão -> calor
{"zão",2,"",{"coalizão"}},
# meninão -> menin
{"ão",3,"",{"camarão","chimarrão","canção","coração","embrião","grotão","glutão","ficção","fogão","feição","furacão","gamão","lampião","leão","macacão","nação","órfão","orgão","patrão","portão","quinhão","rincão","tração","falcão","espião","mamão","folião","cordão","aptidão","campeão","colchão","limão","leilão","melão","barão","milhão","bilhão","fusão","cristão","ilusão","capitão","estação","senão"}}};
# Step 5: Noun Suffix Reduction
{ "Noun", 0, 0, {},
# existencialista -> exist
{"encialista",4},
# minimalista -> minim
{"alista",5},
# contagem -> cont
{"agem",3,"",{"coragem","chantagem","vantagem","carruagem"}},
# gerenciamento -> gerenc
{"iamento",4},
# monitoramento -> monitor
{"amento",3,"",{"firmamento","fundamento","departamento"}},
# nascimento -> nasc
{"imento",3},
{"mento",6,"",{"firmamento","elemento","complemento","instrumento","departamento"}},
# comercializado -> comerci
{"alizado",4},
# traumatizado -> traum
{"atizado",4},
{"tizado",4,"",{"alfabetizado"}},
# alfabetizado -> alfabet
{"izado",5,"",{"organizado","pulverizado"}},
# associativo -> associ
{"ativo",4,"",{"pejorativo","relativo"}},
# contraceptivo -> contracep
{"tivo",4,"",{"relativo"}},
# esportivo -> esport
{"ivo",4,"",{"passivo","possessivo","pejorativo","positivo"}},
# abalado -> abal
{"ado",2,"",{"grado"}},
# impedido -> imped
{"ido",3,"",{"cândido","consolido","rápido","decido","tímido","duvido","marido"}},
# ralador -> ral
{"ador",3},
# entendedor -> entend
{"edor",3},
# cumpridor -> cumpr
{"idor",4,"",{"ouvidor"}},
{"dor",4,"",{"ouvidor"}},
{"sor",4,"",{"assessor"}},
{"atoria",5},
{"tor",3,"",{"benfeitor","leitor","editor","pastor","produtor","promotor","consultor"}},
{"or",2,"",{"motor","melhor","redor","rigor","sensor","tambor","tumor","assessor","benfeitor","pastor","terior","favor","autor"}},
# comparabilidade -> compar
{"abilidade",5},
# abolicionista -> abol
{"icionista",4},
# intervencionista -> interven
{"cionista",5},
{"ionista",5},
{"ionar",5},
# profissional -> profiss
{"ional",4},
# referência -> refer
{"ência",3},
# repugnância -> repugn
{"ância",4,"",{"ambulância"}},
# abatedouro -> abat
{"edouro",3},
# fofoqueiro -> fofoc
{"queiro",3,"c"},
{"adeiro",4,"",{"desfiladeiro"}},
# brasileiro -> brasil
{"eiro",3,"",{"desfiladeiro","pioneiro","mosteiro"}},
{"uoso",3},
# gostoso -> gost
{"oso",3,"",{"precioso"}},
# comercializaç -> comerci
{"alizaç",5},
{"atizaç",5},
{"tizaç",5},
{"izaç",5,"",{"organizaç"}},
# alegaç -> aleg
{"aç",3,"",{"equaç","relaç"}},
# aboliç -> abol
{"iç",3,"",{"eleiç"}},
# anedotário -> anedot
{"ário",3,"",{"voluntário","salário","aniversário","diário","lionário","armário"}},
{"atório",3},
{"rio",5,"",{"voluntário","salário","aniversário","diário","compulsório","lionário","próprio","stério","armário"}},
# ministério -> minist
{"ério",6},
# chinês -> chin
{"ês",4},
# beleza -> bel
{"eza",3},
# rigidez -> rigid
{"ez",4},
# parentesco -> parent
{"esco",4},
# ocupante -> ocup
{"ante",2,"",{"gigante","elefante","adiante","possante","instante","restaurante"}},
# bombástico -> bomb
{"ástico",4,"",{"eclesiástico"}},
{"alístico",3},
{"áutico",4},
{"êutico",4},
{"tico",3,"",{"político","eclesiástico","diagnostico","prático","doméstico","diagnóstico","idêntico","alopático","artístico","autêntico","eclético","crítico","critico"}},
# polêmico -> polêm
{"ico",4,"",{"tico","público","explico"}},
# produtividade -> produt
{"ividade",5},
# profundidade -> profund
{"idade",4,"",{"autoridade","comunidade"}},
# aposentadoria -> aposentad
{"oria",4,"",{"categoria"}},
# existencial -> exist
{"encial",5},
# artista -> art
{"ista",4},
{"auta",5},
# maluquice -> maluc
{"quice",4,"c"},
# chatice -> chat
{"ice",4,"",{"cúmplice"}},
# demoníaco -> demon
{"íaco",3},
# decorrente -> decorr
{"ente",4,"",{"freqüente","alimente","acrescente","permanente","oriente","aparente"}},
{"ense",5},
# criminal -> crim
{"inal",3},
# americano -> americ
{"ano",4},
# amável -> am
{"ável",2,"",{"afável","razoável","potável","vulnerável"}},
# combustível -> combust
{"ível",3,"",{"possível"}},
{"vel",5,"",{"possível","vulnerável","solúvel"}},
{"bil",3,"vel"},
# cobertura -> cobert
{"ura",4,"",{"imatura","acupuntura","costura"}},
{"ural",4},
# consensual -> consens
{"ual",3,"",{"bissexual","virtual","visual","pontual"}},
# mundial -> mund
{"ial",3},
# experimental -> experiment
{"al",4,"",{"afinal","animal","estatal","bissexual","desleal","fiscal","formal","pessoal","liberal","postal","virtual","visual","pontual","sideral","sucursal"}},
{"alismo",4},
{"ivismo",4},
{"ismo",3,"",{"cinismo"}}};
# Step 6: Verb Suffix Reduction
{ "Verb", 0, 0, {},
# cantaríamo -> cant
{"aríamo",2},
# cantássemo -> cant
{"ássemo",2},
# beberíamo -> beb
{"eríamo",2},
# bebêssemo -> beb
{"êssemo",2},
# partiríamo -> part
{"iríamo",3},
# partíssemo -> part
{"íssemo",3},
# cantáramo -> cant
{"áramo",2},
# cantárei -> cant
{"árei",2},
# cantaremo -> cant
{"aremo",2},
# cantariam -> cant
{"ariam",2},
# cantaríei -> cant
{"aríei",2},
# cantássei -> cant
{"ássei",2},
# cantassem -> cant
{"assem",2},
# cantávamo -> cant
{"ávamo",2},
# bebêramo -> beb
{"êramo",3},
# beberemo -> beb
{"eremo",3},
# beberiam -> beb
{"eriam",3},
# beberíei -> beb
{"eríei",3},
# bebêssei -> beb
{"êssei",3},
# bebessem -> beb
{"essem",3},
# partíramo -> part
{"íramo",3},
# partiremo -> part
{"iremo",3},
# partiriam -> part
{"iriam",3},
# partiríei -> part
{"iríei",3},
# partíssei -> part
{"íssei",3},
# partissem -> part
{"issem",3},
# cantando -> cant
{"ando",2},
# bebendo -> beb
{"endo",3},
# partindo -> part
{"indo",3},
# propondo -> prop
{"ondo",3},
# cantaram -> cant
{"aram",2},
{"arão",2},
# cantarde -> cant
{"arde",2},
# cantarei -> cant
{"arei",2},
# cantarem -> cant
{"arem",2},
# cantaria -> cant
{"aria",2},
# cantarmo -> cant
{"armo",2},
# cantasse -> cant
{"asse",2},
# cantaste -> cant
{"aste",2},
# cantavam -> cant
{"avam",2,"",{"agravam"}},
# cantávei -> cant
{"ávei",2},
# beberam -> beb
{"eram",3},
{"erão",3},
# beberde -> beb
{"erde",3},
# beberei -> beb
{"erei",3},
# bebêrei -> beb
{"êrei",3},
# beberem -> beb
{"erem",3},
# beberia -> beb
{"eria",3},
# bebermo -> beb
{"ermo",3},
# bebesse -> beb
{"esse",3},
# bebeste -> beb
{"este",3,"",{"faroeste","agreste"}},
# bebíamo -> beb
{"íamo",3},
# partiram -> part
{"iram",3},
# concluíram -> conclu
{"íram",3},
{"irão",2},
# partirde -> part
{"irde",2},
# partirei -> part
{"irei",3,"",{"admirei"}},
# partirem -> part
{"irem",3,"",{"adquirem"}},
# partiria -> part
{"iria",3},
# partirmo -> part
{"irmo",3},
# partisse -> part
{"isse",3},
# partiste -> part
{"iste",4},
{"iava",4,"",{"ampliava"}},
# cantamo -> cant
{"amo",2},
{"iona",3},
# cantara -> cant
{"ara",2,"",{"arara","prepara"}},
# cantará -> cant
{"ará",2,"",{"alvará"}},
# cantare -> cant
{"are",2,"",{"prepare"}},
# cantava -> cant
{"ava",2,"",{"agrava"}},
# cantemo -> cant
{"emo",2},
# bebera -> beb
{"era",3,"",{"acelera","espera"}},
# beberá -> beb
{"erá",3},
# bebere -> beb
{"ere",3,"",{"espere"}},
# bebiam -> beb
{"iam",3,"",{"enfiam","ampliam","elogiam","ensaiam"}},
# bebíei -> beb
{"íei",3},
# partimo -> part
{"imo",3,"",{"reprimo","intimo","íntimo","nimo","queimo","ximo"}},
# partira -> part
{"ira",3,"",{"fronteira","sátira"}},
{"ído",3},
# partirá -> part
{"irá",3},
{"tizar",4,"",{"alfabetizar"}},
{"izar",5,"",{"organizar"}},
{"itar",5,"",{"acreditar","explicitar","estreitar"}},
# partire -> part
{"ire",3,"",{"adquire"}},
# compomo -> comp
{"omo",3},
# cantai -> cant
{"ai",2},
# cantam -> cant
{"am",2},
# barbear -> barb
{"ear",4,"",{"alardear","nuclear"}},
# cantar -> cant
{"ar",2,"",{"azar","bazaar","patamar"}},
# cheguei -> cheg
{"uei",3},
{"uía",5,"u"},
# cantei -> cant
{"ei",3},
{"guem",3,"g"},
# cantem -> cant
{"em",2,"",{"alem","virgem"}},
# beber -> beb
{"er",2,"",{"éter","pier"}},
# bebeu -> beb
{"eu",3,"",{"chapeu"}},
# bebia -> beb
{"ia",3,"",{"estória","fatia","acia","praia","elogia","mania","lábia","aprecia","polícia","arredia","cheia","ásia"}},
# partir -> part
{"ir",3,"",{"freir"}},
# partiu -> part
{"iu",3},
{"eou",5},
# chegou -> cheg
{"ou",3},
# bebi -> beb
{"i",3}};
# Step 7: Vowel Removal
{ "Vowel", 0, 0, {},
{"bil",2,"vel"},
{"gue",2,"g",{"gangue","jegue"}},
{"á",3},
{"ê",3,"",{"bebê"}},
# menina -> menin
{"a",3,"",{"ásia"}},
# grande -> grand
{"e",3},
# menino -> menin
{"o",3,"",{"ão"}}};

233
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/ro/stopwords.txt

@ -0,0 +1,233 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
acea
aceasta
această
aceea
acei
aceia
acel
acela
acele
acelea
acest
acesta
aceste
acestea
aceşti
aceştia
acolo
acum
ai
aia
aibă
aici
al
ăla
ale
alea
ălea
altceva
altcineva
am
ar
are
aşadar
asemenea
asta
ăsta
astăzi
astea
ăstea
ăştia
asupra
aţi
au
avea
avem
aveţi
azi
bine
bucur
bună
ca
căci
când
care
cărei
căror
cărui
cât
câte
câţi
către
câtva
ce
cel
ceva
chiar
cînd
cine
cineva
cît
cîte
cîţi
cîtva
contra
cu
cum
cumva
curând
curînd
da
dacă
dar
datorită
de
deci
deja
deoarece
departe
deşi
din
dinaintea
dintr
dintre
drept
după
ea
ei
el
ele
eram
este
eşti
eu
face
fără
fi
fie
fiecare
fii
fim
fiţi
iar
ieri
îi
îl
îmi
împotriva
în
înainte
înaintea
încât
încît
încotro
între
întrucât
întrucît
îţi
la
lângă
le
li
lîngă
lor
lui
mâine
mea
mei
mele
mereu
meu
mi
mine
mult
multă
mulţi
ne
nicăieri
nici
nimeni
nişte
noastră
noastre
noi
noştri
nostru
nu
ori
oricând
oricare
oricât
orice
oricînd
oricine
oricît
oricum
oriunde
până
pe
pentru
peste
pînă
poate
pot
prea
prima
primul
prin
printr
sa
săi
sale
sau
său
se
şi
sînt
sîntem
sînteţi
spre
sub
sunt
suntem
sunteţi
ta
tăi
tale
tău
te
ţi
ţie
tine
toată
toate
tot
toţi
totuşi
tu
un
una
unde
undeva
unei
unele
uneori
unor
vi
voastră
voastre
voi
voştri
vostru
vouă
vreo
vreun

108
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/danish_stop.txt

@ -0,0 +1,108 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A Danish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
og | and
i | in
jeg | I
det | that (dem. pronoun)/it (pers. pronoun)
at | that (in front of a sentence)/to (with infinitive)
en | a/an
den | it (pers. pronoun)/that (dem. pronoun)
til | to/at/for/until/against/by/of/into, more
er | present tense of "to be"
som | who, as
på | on/upon/in/on/at/to/after/of/with/for, on
de | they
med | with/by/in, along
han | he
af | of/by/from/off/for/in/with/on, off
for | at/for/to/from/by/of/ago, in front/before, because
ikke | not
der | who/which, there/those
var | past tense of "to be"
mig | me/myself
sig | oneself/himself/herself/itself/themselves
men | but
et | a/an/one, one (number), someone/somebody/one
har | present tense of "to have"
om | round/about/for/in/a, about/around/down, if
vi | we
min | my
havde | past tense of "to have"
ham | him
hun | she
nu | now
over | over/above/across/by/beyond/past/on/about, over/past
da | then, when/as/since
fra | from/off/since, off, since
du | you
ud | out
sin | his/her/its/one's
dem | them
os | us/ourselves
op | up
man | you/one
hans | his
hvor | where
eller | or
hvad | what
skal | must/shall etc.
selv | myself/yourself/herself/ourselves etc., even
her | here
alle | all/everyone/everybody etc.
vil | will (verb)
blev | past tense of "to stay/to remain/to get/to become"
kunne | could
ind | in
når | when
være | present tense of "to be"
dog | however/yet/after all
noget | something
ville | would
jo | you know/you see (adv), yes
deres | their/theirs
efter | after/behind/according to/for/by/from, later/afterwards
ned | down
skulle | should
denne | this
end | than
dette | this
mit | my/mine
også | also
under | under/beneath/below/during, below/underneath
have | have
dig | you
anden | other
hende | her
mine | my
alt | everything
meget | much/very, plenty of
sit | his, her, its, one's
sine | his, her, its, one's
vor | our
mod | against
disse | these
hvis | if
din | your/yours
nogle | some
hos | by/at
blive | be/become
mange | many
ad | by/through
bliver | present tense of "to be/to become"
hendes | her/hers
været | be
thi | for (conj)
jer | you
sådan | such, like this/like that
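The Snowball-derived lists from here on use '|' to begin comments, and a single line may carry several whitespace-separated words (finnish_stop.txt below relies on this to list whole pronoun paradigms per line). Lucene 4.0's WordlistLoader has a dedicated reader for this format; a sketch, assuming the API survives the repackaging unchanged:

// Sketch: load a Snowball-format stop list ('|' comments, multiple
// whitespace-separated words allowed per line).
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;

import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.analysis.util.WordlistLoader;
import com.fr.third.org.apache.lucene.util.Version;

public class LoadSnowballStopwords {
    public static void main(String[] args) throws Exception {
        Reader reader = new InputStreamReader(
            LoadSnowballStopwords.class.getResourceAsStream(
                "/com/fr/third/org/apache/lucene/analysis/snowball/danish_stop.txt"),
            StandardCharsets.UTF_8);
        CharArraySet stopwords = WordlistLoader.getSnowballWordSet(reader, Version.LUCENE_40);
        System.out.println(stopwords.contains("og")); // true: "og | and"
    }
}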

117
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/dutch_stop.txt

@ -0,0 +1,117 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A Dutch stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This is a ranked list (commonest to rarest) of stopwords derived from
| a large sample of Dutch text.
| Dutch stop words frequently exhibit homonym clashes. These are indicated
| clearly below.
de | the
en | and
van | of, from
ik | I, the ego
te | (1) chez, at etc, (2) to, (3) too
dat | that, which
die | that, those, who, which
in | in, inside
een | a, an, one
hij | he
het | the, it
niet | not, nothing, naught
zijn | (1) to be, being, (2) his, one's, its
is | is
was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river
op | on, upon, at, in, up, used up
aan | on, upon, to (as dative)
met | with, by
als | like, such as, when
voor | (1) before, in front of, (2) furrow
had | had, past tense all persons sing. of 'hebben' (have)
er | there
maar | but, only
om | round, about, for etc
hem | him
dan | then
zou | should/would, past tense all persons sing. of 'zullen'
of | or, whether, if
wat | what, something, anything
mijn | possessive and noun 'mine'
men | people, 'one'
dit | this
zo | so, thus, in this way
door | through, by
over | over, across
ze | she, her, they, them
zich | oneself
bij | (1) a bee, (2) by, near, at
ook | also, too
tot | till, until
je | you
mij | me
uit | out of, from
der | Old Dutch form of 'van der' still found in surnames
daar | (1) there, (2) because
haar | (1) her, their, them, (2) hair
naar | (1) unpleasant, unwell etc, (2) towards, (3) as
heb | present first person sing. of 'to have'
hoe | how, why
heeft | present third person sing. of 'to have'
hebben | 'to have' and various parts thereof
deze | this
u | you
want | (1) for, (2) mitten, (3) rigging
nog | yet, still
zal | 'shall', first and third person sing. of verb 'zullen' (will)
me | me
zij | she, they
nu | now
ge | 'thou', still used in Belgium and south Netherlands
geen | none
omdat | because
iets | something, somewhat
worden | to become, grow, get
toch | yet, still
al | all, every, each
waren | (1) 'were', (2) to wander, (3) wares
veel | much, many
meer | (1) more, (2) lake
doen | to do, to make
toen | then, when
moet | noun 'spot/mote' and present form of 'to must'
ben | (1) am, (2) 'are' in interrogative second person singular of 'to be'
zonder | without
kan | noun 'can' and present form of 'to be able'
hun | their, them
dus | so, consequently
alles | all, everything, anything
onder | under, beneath
ja | yes, of course
eens | once, one day
hier | here
wie | who
werd | imperfect third person sing. of 'become'
altijd | always
doch | yet, but etc
wordt | present third person sing. of 'become'
wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans
kunnen | to be able
ons | us/our
zelf | self
tegen | against, towards, at
na | after, near
reeds | already
wil | (1) present tense of 'want', (2) 'will', noun, (3) fender
kon | could; past tense of 'to be able'
niets | nothing
uw | your
iemand | somebody
geweest | been; past participle of 'be'
andere | other

317
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/english_stop.txt

@ -0,0 +1,317 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| An English stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| Many of the forms below are quite rare (e.g. "yourselves") but included for
| completeness.
| PRONOUN FORMS
| 1st person sing
i | subject, always in upper case of course
me | object
my | possessive adjective
| the possessive pronoun `mine' is best suppressed, because of the
| sense of coal-mine etc.
myself | reflexive
| 1st person plural
we | subject
| us | object
| care is required here because US = United States. It is usually
| safe to remove it if it is in lower case.
our | possessive adjective
ours | possessive pronoun
ourselves | reflexive
| second person (archaic `thou' forms not included)
you | subject and object
your | possessive adjective
yours | possessive pronoun
yourself | reflexive (singular)
yourselves | reflexive (plural)
| third person singular
he | subject
him | object
his | possessive adjective and pronoun
himself | reflexive
she | subject
her | object and possessive adjective
hers | possessive pronoun
herself | reflexive
it | subject and object
its | possessive adjective
itself | reflexive
| third person plural
they | subject
them | object
their | possessive adjective
theirs | possessive pronoun
themselves | reflexive
| other forms (demonstratives, interrogatives)
what
which
who
whom
this
that
these
those
| VERB FORMS (using F.R. Palmer's nomenclature)
| BE
am | 1st person, present
is | -s form (3rd person, present)
are | present
was | 1st person, past
were | past
be | infinitive
been | past participle
being | -ing form
| HAVE
have | simple
has | -s form
had | past
having | -ing form
| DO
do | simple
does | -s form
did | past
doing | -ing form
| The forms below are, I believe, best omitted, because of the significant
| homonym forms:
| He made a WILL
| old tin CAN
| merry month of MAY
| a smell of MUST
| fight the good fight with all thy MIGHT
| would, could, should, ought might however be included
| | AUXILIARIES
| | WILL
|will
would
| | SHALL
|shall
should
| | CAN
|can
could
| | MAY
|may
|might
| | MUST
|must
| | OUGHT
ought
| COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
| pronoun + verb
i'm
you're
he's
she's
it's
we're
they're
i've
you've
we've
they've
i'd
you'd
he'd
she'd
we'd
they'd
i'll
you'll
he'll
she'll
we'll
they'll
| verb + negation
isn't
aren't
wasn't
weren't
hasn't
haven't
hadn't
doesn't
don't
didn't
| auxiliary + negation
won't
wouldn't
shan't
shouldn't
can't
cannot
couldn't
mustn't
| miscellaneous forms
let's
that's
who's
what's
here's
there's
when's
where's
why's
how's
| rarer forms
| daren't needn't
| doubtful forms
| oughtn't mightn't
| ARTICLES
a
an
the
| THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
| high, that classification is pointless.)
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
| Just for the record, the following words are among the commonest in English
| one
| every
| least
| less
| many
| now
| ever
| never
| say
| says
| said
| also
| get
| go
| goes
| just
| made
| make
| put
| see
| seen
| whether
| like
| well
| back
| even
| still
| way
| take
| since
| another
| however
| two
| three
| four
| five
| first
| second
| new
| old
| high
| long
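The "us | US = United States" caveat above is a case-sensitivity concern, and that is exactly what CharArraySet's ignoreCase flag controls in Lucene 4.0: with ignoreCase=false, lower-case "us" is filtered while upper-case "US" survives. A sketch, assuming the 4.0 constructor signature under the repackaged namespace:

// Sketch: case-sensitive stop set, so "US" (United States) is kept.
import java.util.Arrays;

import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

final class CaseSensitiveStopSketch {
    public static void main(String[] args) {
        CharArraySet stops = new CharArraySet(
            Version.LUCENE_40, Arrays.asList("us"), /*ignoreCase=*/ false);
        System.out.println(stops.contains("us")); // true  -> filtered out
        System.out.println(stops.contains("US")); // false -> kept in the index
    }
}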

95
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/finnish_stop.txt

@ -0,0 +1,95 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| forms of BE
olla
olen
olet
on
olemme
olette
ovat
ole | negative form
oli
olisi
olisit
olisin
olisimme
olisitte
olisivat
olit
olin
olimme
olitte
olivat
ollut
olleet
en | negation
et
ei
emme
ette
eivät
|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans
minä minun minut minua minussa minusta minuun minulla minulta minulle | I
sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you
hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she
me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we
te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you
he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they
tämä tämän tätä tässä tästä tähän tällä tältä tälle tänä täksi | this
tuo tuon tuota tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that
se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it
nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those
ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they
kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi | who
ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl)
mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what
mitkä | (pl)
joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which
jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl)
| conjunctions
että | that
ja | and
jos | if
koska | because
kuin | than
mutta | but
niin | so
sekä | and
sillä | for
tai | or
vaan | but
vai | or
vaikka | although
| prepositions
kanssa | with
mukaan | according to
noin | about
poikki | across
yli | over, across
| other
kun | when
niin | so
nyt | now
itse | self

183
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/french_stop.txt

@ -0,0 +1,183 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A French stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
au | a + le
aux | a + les
avec | with
ce | this
ces | these
dans | in
de | of
des | de + les
du | de + le
elle | she
en | `of them' etc
et | and
eux | them
il | he
je | I
la | the
le | the
leur | their
lui | him
ma | my (fem)
mais | but
me | me
même | same; as in moi-même (myself) etc
mes | my (pl)
moi | me
mon | my (masc)
ne | not
nos | our (pl)
notre | our
nous | we
on | one
ou | or (où = where)
par | by
pas | not
pour | for
qu | que before vowel
que | that
qui | who
sa | his, her (fem)
se | oneself
ses | his (pl)
son | his, her (masc)
sur | on
ta | thy (fem)
te | thee
tes | thy (pl)
toi | thee
ton | thy (masc)
tu | thou
un | a
une | a
vos | your (pl)
votre | your
vous | you
| single letter forms
c | c'
d | d'
j | j'
l | l'
à | to, at
m | m'
n | n'
s | s'
t | t'
y | there
| forms of être (not including the infinitive):
été
étée
étées
étés
étant
suis
es
est
sommes
êtes
sont
serai
seras
sera
serons
serez
seront
serais
serait
serions
seriez
seraient
étais
était
étions
étiez
étaient
fus
fut
fûmes
fûtes
furent
sois
soit
soyons
soyez
soient
fusse
fusses
fût
fussions
fussiez
fussent
| forms of avoir (not including the infinitive):
ayant
eu
eue
eues
eus
ai
as
avons
avez
ont
aurai
auras
aura
aurons
aurez
auront
aurais
aurait
aurions
auriez
auraient
avais
avait
avions
aviez
avaient
eut
eûmes
eûtes
eurent
aie
aies
ait
ayons
ayez
aient
eusse
eusses
eût
eussions
eussiez
eussent
| Later additions (from Jean-Christophe Deschamps)
ceci | this
celà | that
cet | this
cette | this
ici | here
ils | they
les | the (pl)
leurs | their (pl)
quel | which
quels | which
quelle | which
quelles | which
sans | without
soi | oneself

292
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/german_stop.txt

@ -0,0 +1,292 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A German stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The number of forms in this list is reduced significantly by passing it
| through the German stemmer.
aber | but
alle | all
allem
allen
aller
alles
als | than, as
also | so
am | an + dem
an | at
ander | other
andere
anderem
anderen
anderer
anderes
anderm
andern
anderr
anders
auch | also
auf | on
aus | out of
bei | by
bin | am
bis | until
bist | art
da | there
damit | with it
dann | then
der | the
den
des
dem
die
das
daß | that
derselbe | the same
derselben
denselben
desselben
demselben
dieselbe
dieselben
dasselbe
dazu | to that
dein | thy
deine
deinem
deinen
deiner
deines
denn | because
derer | of those
dessen | of him
dich | thee
dir | to thee
du | thou
dies | this
diese
diesem
diesen
dieser
dieses
doch | (several meanings)
dort | (over) there
durch | through
ein | a
eine
einem
einen
einer
eines
einig | some
einige
einigem
einigen
einiger
einiges
einmal | once
er | he
ihn | him
ihm | to him
es | it
etwas | something
euer | your
eure
eurem
euren
eurer
eures
für | for
gegen | towards
gewesen | p.p. of sein
hab | have
habe | have
haben | have
hat | has
hatte | had
hatten | had
hier | here
hin | there
hinter | behind
ich | I
mich | me
mir | to me
ihr | you, to her
ihre
ihrem
ihren
ihrer
ihres
euch | to you
im | in + dem
in | in
indem | while
ins | in + das
ist | is
jede | each, every
jedem
jeden
jeder
jedes
jene | that
jenem
jenen
jener
jenes
jetzt | now
kann | can
kein | no
keine
keinem
keinen
keiner
keines
können | can
könnte | could
machen | do
man | one
manche | some, many a
manchem
manchen
mancher
manches
mein | my
meine
meinem
meinen
meiner
meines
mit | with
muss | must
musste | had to
nach | to(wards)
nicht | not
nichts | nothing
noch | still, yet
nun | now
nur | only
ob | whether
oder | or
ohne | without
sehr | very
sein | his
seine
seinem
seinen
seiner
seines
selbst | self
sich | herself
sie | they, she
ihnen | to them
sind | are
so | so
solche | such
solchem
solchen
solcher
solches
soll | shall
sollte | should
sondern | but
sonst | else
über | over
um | about, around
und | and
uns | us
unse
unsem
unsen
unser
unses
unter | under
viel | much
vom | von + dem
von | from
vor | before
während | while
war | was
waren | were
warst | wast
was | what
weg | away, off
weil | because
weiter | further
welche | which
welchem
welchen
welcher
welches
wenn | when
werde | will
werden | will
wie | how
wieder | again
will | want
wir | we
wird | will
wirst | willst
wo | where
wollen | want
wollte | wanted
würde | would
würden | would
zu | to
zum | zu + dem
zur | zu + der
zwar | indeed
zwischen | between

209
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/hungarian_stop.txt

@ -0,0 +1,209 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| Hungarian stop word list
| prepared by Anna Tordai
a
ahogy
ahol
aki
akik
akkor
alatt
által
általában
amely
amelyek
amelyekben
amelyeket
amelyet
amelynek
ami
amit
amolyan
amíg
amikor
át
abban
ahhoz
annak
arra
arról
az
azok
azon
azt
azzal
azért
aztán
azután
azonban
bár
be
belül
benne
cikk
cikkek
cikkeket
csak
de
e
eddig
egész
egy
egyes
egyetlen
egyéb
egyik
egyre
ekkor
el
elég
ellen
elő
először
előtt
első
én
éppen
ebben
ehhez
emilyen
ennek
erre
ez
ezt
ezek
ezen
ezzel
ezért
és
fel
felé
hanem
hiszen
hogy
hogyan
igen
így
illetve
ill.
ill
ilyen
ilyenkor
ison
ismét
itt
jól
jobban
kell
kellett
keresztül
keressünk
ki
kívül
között
közül
legalább
lehet
lehetett
legyen
lenne
lenni
lesz
lett
maga
magát
majd
már
más
másik
meg
még
mellett
mert
mely
melyek
mi
mit
míg
miért
milyen
mikor
minden
mindent
mindenki
mindig
mint
mintha
mivel
most
nagy
nagyobb
nagyon
ne
néha
nekem
neki
nem
néhány
nélkül
nincs
olyan
ott
össze
ő
ők
őket
pedig
persze
s
saját
sem
semmi
sok
sokat
sokkal
számára
szemben
szerint
szinte
talán
tehát
teljes
tovább
továbbá
több
úgy
ugyanis
új
újabb
újra
után
utána
utolsó
vagy
vagyis
valaki
valami
valamint
való
vagyok
van
vannak
volt
voltam
voltak
voltunk
vissza
vele
viszont
volna

301
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/italian_stop.txt

@ -0,0 +1,301 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| An Italian stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
ad | a (to) before vowel
al | a + il
allo | a + lo
ai | a + i
agli | a + gli
all | a + l'
agl | a + gl'
alla | a + la
alle | a + le
con | with
col | con + il
coi | con + i (forms collo, cogli etc are now very rare)
da | from
dal | da + il
dallo | da + lo
dai | da + i
dagli | da + gli
dall | da + l'
dagl | da + gl'
dalla | da + la
dalle | da + le
di | of
del | di + il
dello | di + lo
dei | di + i
degli | di + gli
dell | di + l'
degl | di + gl'
della | di + la
delle | di + le
in | in
nel | in + il
nello | in + lo
nei | in + i
negli | in + gli
nell | in + l'
negl | in + gl'
nella | in + la
nelle | in + le
su | on
sul | su + il
sullo | su + lo
sui | su + i
sugli | su + gli
sull | su + l'
sugl | su + gl'
sulla | su + la
sulle | su + le
per | through, by
tra | among
contro | against
io | I
tu | thou
lui | he
lei | she
noi | we
voi | you
loro | they
mio | my
mia |
miei |
mie |
tuo |
tua |
tuoi | thy
tue |
suo |
sua |
suoi | his, her
sue |
nostro | our
nostra |
nostri |
nostre |
vostro | your
vostra |
vostri |
vostre |
mi | me
ti | thee
ci | us, there
vi | you, there
lo | him, the
la | her, the
li | them
le | them, the
gli | to him, the
ne | from there etc
il | the
un | a
uno | a
una | a
ma | but
ed | and
se | if
perché | why, because
anche | also
come | how
dov | where (as dov')
dove | where
che | who, that
chi | who
cui | whom
non | not
più | more
quale | who, that
quanto | how much
quanti |
quanta |
quante |
quello | that
quelli |
quella |
quelle |
questo | this
questi |
questa |
queste |
si | yes
tutto | all
tutti | all
| single letter forms:
a | at
c | as c' for ce or ci
e | and
i | the
l | as l'
o | or
| forms of avere, to have (not including the infinitive):
ho
hai
ha
abbiamo
avete
hanno
abbia
abbiate
abbiano
avrò
avrai
avrà
avremo
avrete
avranno
avrei
avresti
avrebbe
avremmo
avreste
avrebbero
avevo
avevi
aveva
avevamo
avevate
avevano
ebbi
avesti
ebbe
avemmo
aveste
ebbero
avessi
avesse
avessimo
avessero
avendo
avuto
avuta
avuti
avute
| forms of essere, to be (not including the infinitive):
sono
sei
è
siamo
siete
sia
siate
siano
sarò
sarai
sarà
saremo
sarete
saranno
sarei
saresti
sarebbe
saremmo
sareste
sarebbero
ero
eri
era
eravamo
eravate
erano
fui
fosti
fu
fummo
foste
furono
fossi
fosse
fossimo
fossero
essendo
| forms of fare, to do (not including the infinitive, fa, fat-):
faccio
fai
facciamo
fanno
faccia
facciate
facciano
farò
farai
farà
faremo
farete
faranno
farei
faresti
farebbe
faremmo
fareste
farebbero
facevo
facevi
faceva
facevamo
facevate
facevano
feci
facesti
fece
facemmo
faceste
fecero
facessi
facesse
facessimo
facessero
facendo
| forms of stare, to be (not including the infinitive):
sto
stai
sta
stiamo
stanno
stia
stiate
stiano
starò
starai
starà
staremo
starete
staranno
starei
staresti
starebbe
staremmo
stareste
starebbero
stavo
stavi
stava
stavamo
stavate
stavano
stetti
stesti
stette
stemmo
steste
stettero
stessi
stesse
stessimo
stessero
stando

192
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/norwegian_stop.txt

@ -0,0 +1,192 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A Norwegian stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This stop word list is for the dominant bokmål dialect. Words unique
| to nynorsk are marked *.
| Revised by Jan Bruusgaard <Jan.Bruusgaard@ssb.no>, Jan 2005
og | and
i | in
jeg | I
det | it/this/that
at | to (w. inf.)
en | a/an
et | a/an
den | it/this/that
til | to
er | is/am/are
som | who/that
på | on
de | they / you(formal)
med | with
han | he
av | of
ikke | not
ikkje | not *
der | there
så | so
var | was/were
meg | me
seg | oneself/himself etc
men | but
ett | one
har | have
om | about
vi | we
min | my
mitt | my
ha | have
hadde | had
hun | she
nå | now
over | over
da | when/as
ved | by/know
fra | from
du | you
ut | out
sin | his/her/its (reflexive)
dem | them
oss | us
opp | up
man | you/one
kan | can
hans | his
hvor | where
eller | or
hva | what
skal | shall/must
selv | self (reflective)
sjøl | self (reflective)
her | here
alle | all
vil | will
bli | become
ble | became
blei | became *
blitt | have become
kunne | could
inn | in
når | when
være | be
kom | come
noen | some
noe | some
ville | would
dere | you
som | who/which/that
deres | their/theirs
kun | only/just
ja | yes
etter | after
ned | down
skulle | should
denne | this
for | for/because
deg | you
si | hers/his
sine | hers/his
sitt | hers/his
mot | against
å | to
meget | much
hvorfor | why
dette | this
disse | these/those
uten | without
hvordan | how
ingen | none
din | your
ditt | your
blir | become
samme | same
hvilken | which
hvilke | which (plural)
sånn | such a
inni | inside/within
mellom | between
vår | our
hver | each
hvem | who
vors | us/ours
hvis | whose
både | both
bare | only/just
enn | than
fordi | as/because
før | before
mange | many
også | also
slik | just
vært | been
være | to be
båe | both *
begge | both
siden | since
dykk | your *
dykkar | yours *
dei | they *
deira | them *
deires | theirs *
deim | them *
di | your (fem.) *
då | as/when *
eg | I *
ein | a/an *
eit | a/an *
eitt | a/an *
elles | or *
honom | he *
hjå | at *
ho | she *
hoe | she *
henne | her
hennar | her/hers
hennes | hers
hoss | how *
hossen | how *
ikkje | not *
ingi | noone *
inkje | noone *
korleis | how *
korso | how *
kva | what/which *
kvar | where *
kvarhelst | where *
kven | who/whom *
kvi | why *
kvifor | why *
me | we *
medan | while *
mi | my *
mine | my *
mykje | much *
no | now *
nokon | some (masc./neut.) *
noka | some (fem.) *
nokor | some *
noko | some *
nokre | some *
si | his/hers *
sia | since *
sidan | since *
so | so *
somt | some *
somme | some *
um | about*
upp | up *
vere | be *
vore | was *
verte | become *
vort | become *
varte | became *
vart | became *

251
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/portuguese_stop.txt

@ -0,0 +1,251 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A Portuguese stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The following is a ranked list (commonest to rarest) of stopwords
| deriving from a large sample of text.
| Extra words have been added at the end.
de | of, from
a | the; to, at; her
o | the; him
que | who, that
e | and
do | de + o
da | de + a
em | in
um | a
para | for
| é from SER
com | with
não | not, no
uma | a
os | the; them
no | em + o
se | himself etc
na | em + a
por | for
mais | more
as | the; them
dos | de + os
como | as, like
mas | but
| foi from SER
ao | a + o
ele | he
das | de + as
| tem from TER
à | a + a
seu | his
sua | her
ou | or
| ser from SER
quando | when
muito | much
| há from HAV
nos | em + os; us
já | already, now
| está from EST
eu | I
também | also
só | only, just
pelo | per + o
pela | per + a
até | up to
isso | that
ela | she
entre | between
| era from SER
depois | after
sem | without
mesmo | same
aos | a + os
| ter from TER
seus | his
quem | whom
nas | em + as
me | me
esse | that
eles | they
| estão from EST
você | you
| tinha from TER
| foram from SER
essa | that
num | em + um
nem | nor
suas | her
meu | my
às | a + as
minha | my
| têm from TER
numa | em + uma
pelos | per + os
elas | they
| havia from HAV
| seja from SER
qual | which
| será from SER
nós | we
| tenho from TER
lhe | to him, her
deles | of them
essas | those
esses | those
pelas | per + as
este | this
| fosse from SER
dele | of him
| other words. There are many contractions such as naquele = em+aquele,
| mo = me+o, but they are rare.
| Indefinite article plural forms are also rare.
tu | thou
te | thee
vocês | you (plural)
vos | you
lhes | to them
meus | my
minhas
teu | thy
tua
teus
tuas
nosso | our
nossa
nossos
nossas
dela | of her
delas | of them
esta | this
estes | these
estas | these
aquele | that
aquela | that
aqueles | those
aquelas | those
isto | this
aquilo | that
| forms of estar, to be (not including the infinitive):
estou
está
estamos
estão
estive
esteve
estivemos
estiveram
estava
estávamos
estavam
estivera
estivéramos
esteja
estejamos
estejam
estivesse
estivéssemos
estivessem
estiver
estivermos
estiverem
| forms of haver, to have (not including the infinitive):
hei
havemos
hão
houve
houvemos
houveram
houvera
houvéramos
haja
hajamos
hajam
houvesse
houvéssemos
houvessem
houver
houvermos
houverem
houverei
houverá
houveremos
houverão
houveria
houveríamos
houveriam
| forms of ser, to be (not including the infinitive):
sou
somos
são
era
éramos
eram
fui
foi
fomos
foram
fora
fôramos
seja
sejamos
sejam
fosse
fôssemos
fossem
for
formos
forem
serei
será
seremos
serão
seria
seríamos
seriam
| forms of ter, to have (not including the infinitive):
tenho
tem
temos
tém
tinha
tínhamos
tinham
tive
teve
tivemos
tiveram
tivera
tivéramos
tenha
tenhamos
tenham
tivesse
tivéssemos
tivessem
tiver
tivermos
tiverem
terei
terá
teremos
terão
teria
teríamos
teriam

241
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/russian_stop.txt

@ -0,0 +1,241 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| a russian stop word list. comments begin with vertical bar. each stop
| word is at the start of a line.
| this is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
| letter `ё' is translated to `е'.
и | and
в | in/into
во | alternative form
не | not
что | what/that
он | he
на | on/onto
я | i
с | from
со | alternative form
как | how
а | milder form of `но' (but)
то | conjunction and form of `that'
все | all
она | she
так | so, thus
его | him
но | but
да | yes/and
ты | thou
к | towards, by
у | around, chez
же | intensifier particle
вы | you
за | beyond, behind
бы | conditional/subj. particle
по | up to, along
только | only
ее | her
мне | to me
было | it was
вот | here is/are, particle
от | away from
меня | me
еще | still, yet, more
нет | no, there isnt/arent
о | about
из | out of
ему | to him
теперь | now
когда | when
даже | even
ну | so, well
вдруг | suddenly
ли | interrogative particle
если | if
уже | already, but homonym of `narrower'
или | or
ни | neither
быть | to be
был | he was
него | prepositional form of его
до | up to
вас | you accusative
нибудь | indef. suffix preceded by hyphen
опять | again
уж | already, but homonym of `adder'
вам | to you
сказал | he said
ведь | particle `after all'
там | there
потом | then
себя | oneself
ничего | nothing
ей | to her
может | usually with `быть' as `maybe'
они | they
тут | here
где | where
есть | there is/are
надо | got to, must
ней | prepositional form of ей
для | for
мы | we
тебя | thee
их | them, their
чем | than
была | she was
сам | self
чтоб | in order to
без | without
будто | as if
человек | man, person, one
чего | genitive form of `what'
раз | once
тоже | also
себе | to oneself
под | beneath
жизнь | life
будет | will be
ж | short form of intensifier particle `же'
тогда | then
кто | who
этот | this
говорил | was saying
того | genitive form of `that'
потому | for that reason
этого | genitive form of `this'
какой | which
совсем | altogether
ним | prepositional form of `его', `они'
здесь | here
этом | prepositional form of `этот'
один | one
почти | almost
мой | my
тем | instrumental/dative plural of `тот', `то'
чтобы | full form of `in order that'
нее | her (acc.)
кажется | it seems
сейчас | now
были | they were
куда | where to
зачем | why
сказать | to say
всех | all (acc., gen. preposn. plural)
никогда | never
сегодня | today
можно | possible, one can
при | by
наконец | finally
два | two
об | alternative form of `о', about
другой | another
хоть | even
после | after
над | above
больше | more
тот | that one (masc.)
через | across, in
эти | these
нас | us
про | about
всего | in all, only, of all
них | prepositional form of `они' (they)
какая | which, feminine
много | lots
разве | interrogative particle
сказала | she said
три | three
эту | this, acc. fem. sing.
моя | my, feminine
впрочем | moreover, besides
хорошо | good
свою | ones own, acc. fem. sing.
этой | oblique form of `эта', fem. `this'
перед | in front of
иногда | sometimes
лучше | better
чуть | a little
том | preposn. form of `that one'
нельзя | one must not
такой | such a one
им | to them
более | more
всегда | always
конечно | of course
всю | acc. fem. sing of `all'
между | between
| b: some paradigms
|
| personal pronouns
|
| я меня мне мной [мною]
| ты тебя тебе тобой [тобою]
| он его ему им [него, нему, ним]
| она ее ей ею [нее, ней, нею]
| оно его ему им [него, нему, ним]
|
| мы нас нам нами
| вы вас вам вами
| они их им ими [них, ним, ними]
|
| себя себе собой [собою]
|
| demonstrative pronouns: этот (this), тот (that)
|
| этот эта это эти
| этого эту это эти
| этого этой этого этих
| этому этой этому этим
| этим этой этим [этою] этими
| этом этой этом этих
|
| тот та то те
| того ту то те
| того той того тех
| тому той тому тем
| тем той тем [тою] теми
| том той том тех
|
| determinative pronouns
|
| (a) весь (all)
|
| весь вся все все
| всего всю все все
| всего всей всего всех
| всему всей всему всем
| всем всей всем [всею] всеми
| всем всей всем всех
|
| (b) сам (himself etc)
|
| сам сама само сами
| самого саму само самих
| самого самой самого самих
| самому самой самому самим
| самим самой самим [самою] самими
| самом самой самом самих
|
| stems of verbs `to be', `to have', `to do' and modal
|
| быть бы буд быв есть суть
| име
| дел
| мог мож мочь
| уме
| хоч хот
| долж
| можн
| нужн
| нельзя

354
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/spanish_stop.txt

@ -0,0 +1,354 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A Spanish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The following is a ranked list (commonest to rarest) of stopwords
| deriving from a large sample of text.
| Extra words have been added at the end.
de | from, of
la | the, her
que | who, that
el | the
en | in
y | and
a | to
los | the, them
del | de + el
se | himself, from him etc
las | the, them
por | for, by, etc
un | a
para | for
con | with
no | no
una | a
su | his, her
al | a + el
| es from SER
lo | him
como | how
más | more
pero | but
sus | su plural
le | to him, her
ya | already
o | or
| fue from SER
este | this
| ha from HABER
sí | himself etc
porque | because
esta | this
| son from SER
entre | between
| está from ESTAR
cuando | when
muy | very
sin | without
sobre | on
| ser from SER
| tiene from TENER
también | also
me | me
hasta | until
hay | there is/are
donde | where
| han from HABER
quien | whom, that
| están from ESTAR
| estado from ESTAR
desde | from
todo | all
nos | us
durante | during
| estados from ESTAR
todos | all
uno | a
les | to them
ni | nor
contra | against
otros | other
| fueron from SER
ese | that
eso | that
| había from HABER
ante | before
ellos | they
e | and (variant of y)
esto | this
mí | me
antes | before
algunos | some
qué | what?
unos | a
yo | I
otro | other
otras | other
otra | other
él | he
tanto | so much, many
esa | that
estos | these
mucho | much, many
quienes | who
nada | nothing
muchos | many
cual | who
| sea from SER
poco | few
ella | she
estar | to be
| haber from HABER
estas | these
| estaba from ESTAR
| estamos from ESTAR
algunas | some
algo | something
nosotros | we
| other forms
mi | me
mis | mi plural
tú | thou
te | thee
ti | thee
tu | thy
tus | tu plural
ellas | they
nosotras | we
vosotros | you
vosotras | you
os | you
mío | mine
mía |
míos |
mías |
tuyo | thine
tuya |
tuyos |
tuyas |
suyo | his, hers, theirs
suya |
suyos |
suyas |
nuestro | ours
nuestra |
nuestros |
nuestras |
vuestro | yours
vuestra |
vuestros |
vuestras |
esos | those
esas | those
| forms of estar, to be (not including the infinitive):
estoy
estás
está
estamos
estáis
están
esté
estés
estemos
estéis
estén
estaré
estarás
estará
estaremos
estaréis
estarán
estaría
estarías
estaríamos
estaríais
estarían
estaba
estabas
estábamos
estabais
estaban
estuve
estuviste
estuvo
estuvimos
estuvisteis
estuvieron
estuviera
estuvieras
estuviéramos
estuvierais
estuvieran
estuviese
estuvieses
estuviésemos
estuvieseis
estuviesen
estando
estado
estada
estados
estadas
estad
| forms of haber, to have (not including the infinitive):
he
has
ha
hemos
habéis
han
haya
hayas
hayamos
hayáis
hayan
habré
habrás
habrá
habremos
habréis
habrán
habría
habrías
habríamos
habríais
habrían
había
habías
habíamos
habíais
habían
hube
hubiste
hubo
hubimos
hubisteis
hubieron
hubiera
hubieras
hubiéramos
hubierais
hubieran
hubiese
hubieses
hubiésemos
hubieseis
hubiesen
habiendo
habido
habida
habidos
habidas
| forms of ser, to be (not including the infinitive):
soy
eres
es
somos
sois
son
sea
seas
seamos
seáis
sean
seré
serás
será
seremos
seréis
serán
sería
serías
seríamos
seríais
serían
era
eras
éramos
erais
eran
fui
fuiste
fue
fuimos
fuisteis
fueron
fuera
fueras
fuéramos
fuerais
fueran
fuese
fueses
fuésemos
fueseis
fuesen
siendo
sido
| sed also means 'thirst'
| forms of tener, to have (not including the infinitive):
tengo
tienes
tiene
tenemos
tenéis
tienen
tenga
tengas
tengamos
tengáis
tengan
tendré
tendrás
tendrá
tendremos
tendréis
tendrán
tendría
tendrías
tendríamos
tendríais
tendrían
tenía
tenías
teníamos
teníais
tenían
tuve
tuviste
tuvo
tuvimos
tuvisteis
tuvieron
tuviera
tuvieras
tuviéramos
tuvierais
tuvieran
tuviese
tuvieses
tuviésemos
tuvieseis
tuviesen
teniendo
tenido
tenida
tenidos
tenidas
tened
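
These Snowball-distributed lists all share one format: a vertical bar starts a comment and each stop word begins a line. A minimal loading sketch, assuming the file ships on the classpath next to SnowballFilter (which matches the resource layout above); CharArraySet, WordlistLoader, and IOUtils are the standard Lucene 4.0 utility classes:

    // Parse a Snowball-format stop list into a CharArraySet.
    Reader reader = IOUtils.getDecodingReader(
        SnowballFilter.class, "spanish_stop.txt", IOUtils.CHARSET_UTF_8);
    try {
      CharArraySet stopWords =
          WordlistLoader.getSnowballWordSet(reader, Version.LUCENE_40);
      System.out.println(stopWords.size() + " stop words loaded");
    } finally {
      reader.close();
    }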

131
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/snowball/swedish_stop.txt

@@ -0,0 +1,131 @@
| From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
| A Swedish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
| Swedish stop words occasionally exhibit homonym clashes. For example
| så = so, but also seed. These are indicated clearly below.
och | and
det | it, this/that
att | to (with infinitive)
i | in, at
en | a
jag | I
hon | she
som | who, that
han | he
på | on
den | it, this/that
med | with
var | where, each
sig | him(self) etc
för | for
så | so (also: seed)
till | to
är | is
men | but
ett | a
om | if; around, about
hade | had
de | they, these/those
av | of
icke | not, no
mig | me
du | you
henne | her
då | then, when
sin | his
nu | now
har | have
inte | inte någon = no one
hans | his
honom | him
skulle | 'sake'
hennes | her
där | there
min | my
man | one (pronoun)
ej | nor
vid | at, by, on (also: vast)
kunde | could
något | some etc
från | from, off
ut | out
när | when
efter | after, behind
upp | up
vi | we
dem | them
vara | be
vad | what
över | over
än | than
dig | you
kan | can
sina | his
här | here
ha | have
mot | towards
alla | all
under | under (also: wonder)
någon | some etc
eller | or (else)
allt | all
mycket | much
sedan | since
ju | why
denna | this/that
själv | myself, yourself etc
detta | this/that
åt | to
utan | without
varit | was
hur | how
ingen | no
mitt | my
ni | you
bli | to be, become
blev | from bli
oss | us
din | thy
dessa | these/those
några | some etc
deras | their
blir | from bli
mina | my
samma | (the) same
vilken | who, that
er | you, your
sådan | such a
vår | our
blivit | from bli
dess | its
inom | within
mellan | between
sådant | such a
varför | why
varje | each
vilka | who, that
ditt | thy
vem | who
vilket | who, that
sitta | his
sådana | such a
vart | each
dina | thy
vars | whose
vårt | our
våra | our
ert | your
era | your
vilkas | whose

119
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/th/stopwords.txt

@@ -0,0 +1,119 @@
# Thai stopwords from:
# "Opinion Detection in Thai Political News Columns
# Based on Subjectivity Analysis"
# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak
ไว้
ไม่
ไป
ได้
ให้
ใน
โดย
แห่ง
แล้ว
และ
แรก
แบบ
แต่
เอง
เห็น
เลย
เริ่ม
เรา
เมื่อ
เพื่อ
เพราะ
เป็นการ
เป็น
เปิดเผย
เปิด
เนื่องจาก
เดียวกัน
เดียว
เช่น
เฉพาะ
เคย
เข้า
เขา
อีก
อาจ
อะไร
ออก
อย่าง
อยู่
อยาก
หาก
หลาย
หลังจาก
หลัง
หรือ
หนึ่ง
ส่วน
ส่ง
สุด
สำหรับ
ว่า
วัน
ลง
ร่วม
ราย
รับ
ระหว่าง
รวม
ยัง
มี
มาก
มา
พร้อม
พบ
ผ่าน
ผล
บาง
น่า
นี้
นำ
นั้น
นัก
นอกจาก
ทุก
ที่สุด
ที่
ทำให้
ทำ
ทาง
ทั้งนี้
ทั้ง
ถ้า
ถูก
ถึง
ต้อง
ต่างๆ
ต่าง
ต่อ
ตาม
ตั้งแต่
ตั้ง
ด้าน
ด้วย
ดัง
ซึ่ง
ช่วง
จึง
จาก
จัด
จะ
คือ
ความ
ครั้ง
คง
ขึ้น
ของ
ขอ
ขณะ
ก่อน
ก็
การ
กว่า
กล่าว

212
fine-lucene/resources/com/fr/third/org/apache/lucene/analysis/tr/stopwords.txt

@@ -0,0 +1,212 @@
# Turkish stopwords from LUCENE-559
# merged with the list from "Information Retrieval on Turkish Texts"
# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf)
acaba
altmış
altı
ama
ancak
arada
aslında
ayrıca
bana
bazı
belki
ben
benden
beni
benim
beri
beş
bile
bin
bir
birçok
biri
birkaç
birkez
birşey
birşeyi
biz
bize
bizden
bizi
bizim
böyle
böylece
bu
buna
bunda
bundan
bunlar
bunları
bunların
bunu
bunun
burada
çok
çünkü
da
daha
dahi
de
defa
değil
diğer
diye
doksan
dokuz
dolayı
dolayısıyla
dört
edecek
eden
ederek
edilecek
ediliyor
edilmesi
ediyor
eğer
elli
en
etmesi
etti
ettiği
ettiğini
gibi
göre
halen
hangi
hatta
hem
henüz
hep
hepsi
her
herhangi
herkesin
hiç
hiçbir
için
iki
ile
ilgili
ise
işte
itibaren
itibariyle
kadar
karşın
katrilyon
kendi
kendilerine
kendini
kendisi
kendisine
kendisini
kez
ki
kim
kimden
kime
kimi
kimse
kırk
milyar
milyon
mu
nasıl
ne
neden
nedenle
nerde
nerede
nereye
niye
niçin
o
olan
olarak
oldu
olduğu
olduğunu
olduklarını
olmadı
olmadığı
olmak
olması
olmayan
olmaz
olsa
olsun
olup
olur
olursa
oluyor
on
ona
ondan
onlar
onlardan
onları
onların
onu
onun
otuz
oysa
öyle
pek
rağmen
sadece
sanki
sekiz
seksen
sen
senden
seni
senin
siz
sizden
sizi
sizin
şey
şeyden
şeyi
şeyler
şöyle
şu
şuna
şunda
şundan
şunları
şunu
tarafından
trilyon
tüm
üç
üzere
var
vardı
ve
veya
ya
yani
yapacak
yapılan
yapılması
yapıyor
yapmak
yaptı
yaptığı
yaptığını
yaptıkları
yedi
yerine
yetmiş
yine
yirmi
yoksa
yüz
zaten

29
fine-lucene/src/com/fr/third/org/apache/lucene/LucenePackage.java

@@ -0,0 +1,29 @@
package com.fr.third.org.apache.lucene;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Lucene's package information, including version. **/
public final class LucenePackage {
private LucenePackage() {} // can't construct
/** Return Lucene's package, including version information. */
public static Package get() {
return LucenePackage.class.getPackage();
}
}
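
A quick usage sketch (illustrative, not part of the file above): the Package returned by get() exposes the jar manifest's implementation metadata, which is how callers read the bundled version at runtime. When the classes are loaded outside a jar, these getters may return null:

    // Hypothetical usage: read the bundled version at runtime.
    Package p = LucenePackage.get();
    System.out.println(p.getName() + " " + p.getImplementationVersion());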

393
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/Analyzer.java

@@ -0,0 +1,393 @@
package com.fr.third.org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.store.AlreadyClosedException;
import com.fr.third.org.apache.lucene.util.CloseableThreadLocal;
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
/**
* An Analyzer builds TokenStreams, which analyze text. It thus represents a
* policy for extracting index terms from text.
* <p>
* In order to define what analysis is done, subclasses must define their
* {@link TokenStreamComponents TokenStreamComponents} in {@link #createComponents(String, Reader)}.
* The components are then reused in each call to {@link #tokenStream(String, Reader)}.
* <p>
* Simple example:
* <pre class="prettyprint">
* Analyzer analyzer = new Analyzer() {
* {@literal @Override}
* protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
* Tokenizer source = new FooTokenizer(reader);
* TokenStream filter = new FooFilter(source);
* filter = new BarFilter(filter);
* return new TokenStreamComponents(source, filter);
* }
* };
* </pre>
* For more examples, see the {@link com.fr.third.org.apache.lucene.analysis Analysis package documentation}.
* <p>
* For some concrete implementations bundled with Lucene, look in the analysis modules:
* <ul>
* <li><a href="{@docRoot}/../analyzers-common/overview-summary.html">Common</a>:
* Analyzers for indexing content in different languages and domains.
* <li><a href="{@docRoot}/../analyzers-icu/overview-summary.html">ICU</a>:
* Exposes functionality from ICU to Apache Lucene.
* <li><a href="{@docRoot}/../analyzers-kuromoji/overview-summary.html">Kuromoji</a>:
* Morphological analyzer for Japanese text.
* <li><a href="{@docRoot}/../analyzers-morfologik/overview-summary.html">Morfologik</a>:
* Dictionary-driven lemmatization for the Polish language.
* <li><a href="{@docRoot}/../analyzers-phonetic/overview-summary.html">Phonetic</a>:
* Analysis for indexing phonetic signatures (for sounds-alike search).
* <li><a href="{@docRoot}/../analyzers-smartcn/overview-summary.html">Smart Chinese</a>:
* Analyzer for Simplified Chinese, which indexes words.
* <li><a href="{@docRoot}/../analyzers-stempel/overview-summary.html">Stempel</a>:
* Algorithmic Stemmer for the Polish Language.
* <li><a href="{@docRoot}/../analyzers-uima/overview-summary.html">UIMA</a>:
* Analysis integration with Apache UIMA.
* </ul>
*/
public abstract class Analyzer implements Closeable {
private final ReuseStrategy reuseStrategy;
/**
* Create a new Analyzer, reusing the same set of components per-thread
* across calls to {@link #tokenStream(String, Reader)}.
*/
public Analyzer() {
this(new GlobalReuseStrategy());
}
/**
* Expert: create a new Analyzer with a custom {@link ReuseStrategy}.
* <p>
* NOTE: if you just want to reuse on a per-field basis, it's easier to
* use a subclass of {@link AnalyzerWrapper} such as
* <a href="{@docRoot}/../analyzers-common/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.html">
* PerFieldAnalyzerWrapper</a> instead.
*/
public Analyzer(ReuseStrategy reuseStrategy) {
this.reuseStrategy = reuseStrategy;
}
/**
* Creates a new {@link TokenStreamComponents} instance for this analyzer.
*
* @param fieldName
* the name of the field's content passed to the
* {@link TokenStreamComponents} sink as a reader
* @param reader
* the reader passed to the {@link Tokenizer} constructor
* @return the {@link TokenStreamComponents} for this analyzer.
*/
protected abstract TokenStreamComponents createComponents(String fieldName,
Reader reader);
/**
* Returns a TokenStream suitable for <code>fieldName</code>, tokenizing
* the contents of <code>reader</code>.
* <p>
* This method uses {@link #createComponents(String, Reader)} to obtain an
* instance of {@link TokenStreamComponents}. It returns the sink of the
* components and stores the components internally. Subsequent calls to this
* method will reuse the previously stored components after resetting them
* through {@link TokenStreamComponents#setReader(Reader)}.
* <p>
* <b>NOTE:</b> After calling this method, the consumer must follow the
* workflow described in {@link TokenStream} to properly consume its contents.
* See the {@link com.fr.third.org.apache.lucene.analysis Analysis package documentation} for
* some examples demonstrating this.
*
* @param fieldName the name of the field the created TokenStream is used for
* @param reader the reader the stream's source reads from
* @return TokenStream for iterating the analyzed content of <code>reader</code>
* @throws AlreadyClosedException if the Analyzer is closed.
* @throws IOException if an i/o error occurs.
*/
public final TokenStream tokenStream(final String fieldName,
final Reader reader) throws IOException {
TokenStreamComponents components = reuseStrategy.getReusableComponents(fieldName);
final Reader r = initReader(fieldName, reader);
if (components == null) {
components = createComponents(fieldName, r);
reuseStrategy.setReusableComponents(fieldName, components);
} else {
components.setReader(r);
}
return components.getTokenStream();
}
/**
* Override this if you want to add a CharFilter chain.
* <p>
* The default implementation returns <code>reader</code>
* unchanged.
*
* @param fieldName IndexableField name being indexed
* @param reader original Reader
* @return reader, optionally decorated with CharFilter(s)
*/
protected Reader initReader(String fieldName, Reader reader) {
return reader;
}
/**
* Invoked before indexing an IndexableField instance if
* terms have already been added to that field. This allows custom
* analyzers to place an automatic position increment gap between
* IndexableField instances using the same field name. The default
* position increment gap is 0. With a 0 position increment gap and
* the typical default token position increment of 1, all terms in a field,
* including across IndexableField instances, are in successive positions, allowing
* exact PhraseQuery matches, for instance, across IndexableField instance boundaries.
*
* @param fieldName IndexableField name being indexed.
* @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}.
* This value must be {@code >= 0}.
*/
public int getPositionIncrementGap(String fieldName) {
return 0;
}
/**
* Just like {@link #getPositionIncrementGap}, except for
* Token offsets instead. By default this returns 1.
* This method is only called if the field
* produced at least one token for indexing.
*
* @param fieldName the field just indexed
* @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}.
* This value must be {@code >= 0}.
*/
public int getOffsetGap(String fieldName) {
return 1;
}
/** Frees persistent resources used by this Analyzer */
public void close() {
reuseStrategy.close();
}
/**
* This class encapsulates the outer components of a token stream. It provides
* access to the source ({@link Tokenizer}) and the outer end (sink), an
* instance of {@link TokenFilter} which also serves as the
* {@link TokenStream} returned by
* {@link Analyzer#tokenStream(String, Reader)}.
*/
public static class TokenStreamComponents {
/**
* Original source of the tokens.
*/
protected final Tokenizer source;
/**
* Sink tokenstream, such as the outer tokenfilter decorating
* the chain. This can be the source if there are no filters.
*/
protected final TokenStream sink;
/**
* Creates a new {@link TokenStreamComponents} instance.
*
* @param source
* the analyzer's tokenizer
* @param result
* the analyzer's resulting token stream
*/
public TokenStreamComponents(final Tokenizer source,
final TokenStream result) {
this.source = source;
this.sink = result;
}
/**
* Creates a new {@link TokenStreamComponents} instance.
*
* @param source
* the analyzer's tokenizer
*/
public TokenStreamComponents(final Tokenizer source) {
this.source = source;
this.sink = source;
}
/**
* Resets the encapsulated components with the given reader. If the components
* cannot be reset, an Exception should be thrown.
*
* @param reader
* a reader to reset the source component
* @throws IOException
* if the component's reset method throws an {@link IOException}
*/
protected void setReader(final Reader reader) throws IOException {
source.setReader(reader);
}
/**
* Returns the sink {@link TokenStream}
*
* @return the sink {@link TokenStream}
*/
public TokenStream getTokenStream() {
return sink;
}
/**
* Returns the component's {@link Tokenizer}
*
* @return Component's {@link Tokenizer}
*/
public Tokenizer getTokenizer() {
return source;
}
}
/**
* Strategy defining how TokenStreamComponents are reused per call to
* {@link Analyzer#tokenStream(String, Reader)}.
*/
public static abstract class ReuseStrategy implements Closeable {
private CloseableThreadLocal<Object> storedValue = new CloseableThreadLocal<Object>();
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
public ReuseStrategy() {}
/**
* Gets the reusable TokenStreamComponents for the field with the given name
*
* @param fieldName Name of the field whose reusable TokenStreamComponents
* are to be retrieved
* @return Reusable TokenStreamComponents for the field, or {@code null}
* if there were no previous components for the field
*/
public abstract TokenStreamComponents getReusableComponents(String fieldName);
/**
* Stores the given TokenStreamComponents as the reusable components for the
* field with the given name
*
* @param fieldName Name of the field whose TokenStreamComponents are being set
* @param components TokenStreamComponents which are to be reused for the field
*/
public abstract void setReusableComponents(String fieldName, TokenStreamComponents components);
/**
* Returns the currently stored value
*
* @return Currently stored value or {@code null} if no value is stored
* @throws AlreadyClosedException if the ReuseStrategy is closed.
*/
protected final Object getStoredValue() {
try {
return storedValue.get();
} catch (NullPointerException npe) {
if (storedValue == null) {
throw new AlreadyClosedException("this Analyzer is closed");
} else {
throw npe;
}
}
}
/**
* Sets the stored value
*
* @param storedValue Value to store
* @throws AlreadyClosedException if the ReuseStrategy is closed.
*/
protected final void setStoredValue(Object storedValue) {
try {
this.storedValue.set(storedValue);
} catch (NullPointerException npe) {
if (storedValue == null) {
throw new AlreadyClosedException("this Analyzer is closed");
} else {
throw npe;
}
}
}
/**
* Closes the ReuseStrategy, freeing any resources
*/
public void close() {
if (storedValue != null) {
storedValue.close();
storedValue = null;
}
}
}
/**
* Implementation of {@link ReuseStrategy} that reuses the same components for
* every field.
*/
public final static class GlobalReuseStrategy extends ReuseStrategy {
/** Creates a new instance, with empty per-thread values */
public GlobalReuseStrategy() {}
@Override
public TokenStreamComponents getReusableComponents(String fieldName) {
return (TokenStreamComponents) getStoredValue();
}
@Override
public void setReusableComponents(String fieldName, TokenStreamComponents components) {
setStoredValue(components);
}
}
/**
* Implementation of {@link ReuseStrategy} that reuses components per-field by
* maintaining a Map of TokenStreamComponents per field name.
*/
public static class PerFieldReuseStrategy extends ReuseStrategy {
/** Creates a new instance, with empty per-thread-per-field values */
public PerFieldReuseStrategy() {}
@SuppressWarnings("unchecked")
@Override
public TokenStreamComponents getReusableComponents(String fieldName) {
Map<String, TokenStreamComponents> componentsPerField = (Map<String, TokenStreamComponents>) getStoredValue();
return componentsPerField != null ? componentsPerField.get(fieldName) : null;
}
@SuppressWarnings("unchecked")
@Override
public void setReusableComponents(String fieldName, TokenStreamComponents components) {
Map<String, TokenStreamComponents> componentsPerField = (Map<String, TokenStreamComponents>) getStoredValue();
if (componentsPerField == null) {
componentsPerField = new HashMap<String, TokenStreamComponents>();
setStoredValue(componentsPerField);
}
componentsPerField.put(fieldName, components);
}
}
}
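
To make the tokenStream(...) contract above concrete, here is a minimal consumption sketch. The field name, the sample text, and the WhitespaceTokenizer/CharTermAttribute classes (from the analyzers-common module and the tokenattributes package) are illustrative choices, not part of this file:

    // Build a trivial Analyzer and walk its tokens following the
    // reset() / incrementToken() / end() / close() workflow.
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_40, reader);
        return new TokenStreamComponents(source);
      }
    };
    TokenStream ts = analyzer.tokenStream("body", new StringReader("hello world"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                       // mandatory before the first incrementToken()
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();                         // records the final offset state
    ts.close();
    analyzer.close();

A second tokenStream("body", ...) call on the same thread reuses the cached TokenStreamComponents through the GlobalReuseStrategy, which is the point of the reuse machinery above.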

83
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/AnalyzerWrapper.java

@@ -0,0 +1,83 @@
package com.fr.third.org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
/**
* Extension to {@link Analyzer} suitable for Analyzers which wrap
* other Analyzers.
* <p/>
* {@link #getWrappedAnalyzer(String)} allows the Analyzer
* to wrap multiple Analyzers which are selected on a per field basis.
* <p/>
* {@link #wrapComponents(String, TokenStreamComponents)} allows the
* TokenStreamComponents of the wrapped Analyzer to then be wrapped
* (such as adding a new {@link TokenFilter} to form new TokenStreamComponents).
*/
public abstract class AnalyzerWrapper extends Analyzer {
/**
* Creates a new AnalyzerWrapper. Since the {@link ReuseStrategy} of
* the wrapped Analyzers are unknown, {@link PerFieldReuseStrategy} is assumed
*/
protected AnalyzerWrapper() {
super(new PerFieldReuseStrategy());
}
/**
* Retrieves the wrapped Analyzer appropriate for analyzing the field with
* the given name
*
* @param fieldName Name of the field which is to be analyzed
* @return Analyzer for the field with the given name. Assumed to be non-null
*/
protected abstract Analyzer getWrappedAnalyzer(String fieldName);
/**
* Wraps / alters the given TokenStreamComponents, taken from the wrapped
* Analyzer, to form new components. It is through this method that new
* TokenFilters can be added by AnalyzerWrappers.
*
* @param fieldName Name of the field which is to be analyzed
* @param components TokenStreamComponents taken from the wrapped Analyzer
* @return Wrapped / altered TokenStreamComponents.
*/
protected abstract TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components);
@Override
protected final TokenStreamComponents createComponents(String fieldName, Reader aReader) {
return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName, aReader));
}
@Override
public final int getPositionIncrementGap(String fieldName) {
return getWrappedAnalyzer(fieldName).getPositionIncrementGap(fieldName);
}
@Override
public final int getOffsetGap(String fieldName) {
return getWrappedAnalyzer(fieldName).getOffsetGap(fieldName);
}
@Override
public final Reader initReader(String fieldName, Reader reader) {
return getWrappedAnalyzer(fieldName).initReader(fieldName, reader);
}
}
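
A minimal subclass sketch, assuming a wrapped delegate Analyzer and the LowerCaseFilter from the analyzers-common module (both names are illustrative here):

    // Delegates every field to one Analyzer and appends a
    // LowerCaseFilter to its token chain.
    public final class LowerCasingWrapper extends AnalyzerWrapper {
      private final Analyzer delegate;
      public LowerCasingWrapper(Analyzer delegate) {
        this.delegate = delegate;
      }
      @Override
      protected Analyzer getWrappedAnalyzer(String fieldName) {
        return delegate;              // same Analyzer for all fields
      }
      @Override
      protected TokenStreamComponents wrapComponents(String fieldName,
          TokenStreamComponents components) {
        TokenStream lowercased =
            new LowerCaseFilter(Version.LUCENE_40, components.getTokenStream());
        return new TokenStreamComponents(components.getTokenizer(), lowercased);
      }
    }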

98
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/CachingTokenFilter.java

@@ -0,0 +1,98 @@
package com.fr.third.org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import com.fr.third.org.apache.lucene.util.AttributeSource;
/**
* This class can be used if the token attributes of a TokenStream
* are intended to be consumed more than once. It caches
* all token attribute states locally in a List.
*
* <P>CachingTokenFilter implements the optional method
* {@link TokenStream#reset()}, which repositions the
* stream to the first Token.
*/
public final class CachingTokenFilter extends TokenFilter {
private List<State> cache = null;
private Iterator<State> iterator = null;
private State finalState;
/**
* Create a new CachingTokenFilter around <code>input</code>,
* caching its token attributes, which can be replayed again
* after a call to {@link #reset()}.
*/
public CachingTokenFilter(TokenStream input) {
super(input);
}
@Override
public final boolean incrementToken() throws IOException {
if (cache == null) {
// fill cache lazily
cache = new LinkedList<State>();
fillCache();
iterator = cache.iterator();
}
if (!iterator.hasNext()) {
// the cache is exhausted, return false
return false;
}
// Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
restoreState(iterator.next());
return true;
}
@Override
public final void end() {
if (finalState != null) {
restoreState(finalState);
}
}
/**
* Rewinds the iterator to the beginning of the cached list.
* <p>
* Note that this does not call reset() on the wrapped tokenstream ever, even
* the first time. You should reset() the inner tokenstream before wrapping
* it with CachingTokenFilter.
*/
@Override
public void reset() {
if(cache != null) {
iterator = cache.iterator();
}
}
private void fillCache() throws IOException {
while(input.incrementToken()) {
cache.add(captureState());
}
// capture final state
input.end();
finalState = captureState();
}
}
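
A replay sketch, following the javadoc's instruction to reset() the inner stream before wrapping; the analyzer, field name, and sample text are assumed from context:

    // Consume the same tokens twice: the first pass fills the cache,
    // reset() rewinds the cache, and the second pass replays it.
    TokenStream inner = analyzer.tokenStream("body", new StringReader("a b a"));
    inner.reset();                                   // reset the inner stream first
    CachingTokenFilter cached = new CachingTokenFilter(inner);
    CharTermAttribute term = cached.addAttribute(CharTermAttribute.class);
    while (cached.incrementToken()) { /* first consumer */ }
    cached.reset();                                  // rewind to the cached start
    while (cached.incrementToken()) { /* second consumer sees the same tokens */ }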

84
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/CharFilter.java

@@ -0,0 +1,84 @@
package com.fr.third.org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
/**
* Subclasses of CharFilter can be chained to filter a Reader.
* They can be used as a {@link Reader} with additional offset
* correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
* if a CharFilter subclass is used.
* <p>
* This class is abstract: at a minimum you must implement {@link #read(char[], int, int)},
* transforming the input in some way from {@link #input}, and {@link #correct(int)}
* to adjust the offsets to match the originals.
* <p>
* You can optionally provide more efficient implementations of additional methods
* like {@link #read()}, {@link #read(char[])}, {@link #read(java.nio.CharBuffer)},
* but this is not required.
* <p>
* For examples and integration with {@link Analyzer}, see the
* {@link com.fr.third.org.apache.lucene.analysis Analysis package documentation}.
*/
// the way java.io.FilterReader should work!
public abstract class CharFilter extends Reader {
/**
* The underlying character-input stream.
*/
protected final Reader input;
/**
* Create a new CharFilter wrapping the provided reader.
* @param input a Reader, can also be a CharFilter for chaining.
*/
public CharFilter(Reader input) {
super(input);
this.input = input;
}
/**
* Closes the underlying input stream.
* <p>
* <b>NOTE:</b>
* The default implementation closes the input Reader, so
* be sure to call <code>super.close()</code> when overriding this method.
*/
@Override
public void close() throws IOException {
input.close();
}
/**
* Subclasses override to correct the current offset.
*
* @param currentOff current offset
* @return corrected offset
*/
protected abstract int correct(int currentOff);
/**
* Chains the corrected offset through the input
* CharFilter(s).
*/
public final int correctOffset(int currentOff) {
final int corrected = correct(currentOff);
return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(corrected) : corrected;
}
}
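
A minimal subclass sketch: a pass-through filter whose correct() is the identity, since it never inserts or removes characters. Any filter that changes the text length must instead map each filtered offset back to the corresponding original offset:

    public final class PassThroughCharFilter extends CharFilter {
      public PassThroughCharFilter(Reader input) {
        super(input);
      }
      @Override
      protected int correct(int currentOff) {
        return currentOff;            // no length changes, offsets unchanged
      }
      @Override
      public int read(char[] cbuf, int off, int len) throws IOException {
        return input.read(cbuf, off, len);
      }
    }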

321
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/NumericTokenStream.java

@@ -0,0 +1,321 @@
package com.fr.third.org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.fr.third.org.apache.lucene.document.DoubleField; // for javadocs
import com.fr.third.org.apache.lucene.document.FloatField; // for javadocs
import com.fr.third.org.apache.lucene.document.IntField; // for javadocs
import com.fr.third.org.apache.lucene.document.LongField; // for javadocs
import com.fr.third.org.apache.lucene.search.NumericRangeFilter; // for javadocs
import com.fr.third.org.apache.lucene.search.NumericRangeQuery;
import com.fr.third.org.apache.lucene.util.Attribute;
import com.fr.third.org.apache.lucene.util.AttributeImpl;
import com.fr.third.org.apache.lucene.util.AttributeReflector;
import com.fr.third.org.apache.lucene.util.BytesRef;
import com.fr.third.org.apache.lucene.util.NumericUtils;
/**
* <b>Expert:</b> This class provides a {@link TokenStream}
* for indexing numeric values that can be used by {@link
* NumericRangeQuery} or {@link NumericRangeFilter}.
*
* <p>Note that for simple usage, {@link IntField}, {@link
* LongField}, {@link FloatField} or {@link DoubleField} is
* recommended. These fields disable norms and
* term freqs, as they are not usually needed during
* searching. If you need to change these settings, you
* should use this class.
*
* <p>Here's an example usage, for an <code>int</code> field:
*
* <pre class="prettyprint">
* FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
* fieldType.setOmitNorms(true);
* fieldType.setIndexOptions(IndexOptions.DOCS_ONLY);
* Field field = new Field(name, new NumericTokenStream(precisionStep).setIntValue(value), fieldType);
* document.add(field);
* </pre>
*
* <p>For optimal performance, re-use the TokenStream and Field instance
* for more than one document:
*
* <pre class="prettyprint">
* NumericTokenStream stream = new NumericTokenStream(precisionStep);
* FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
* fieldType.setOmitNorms(true);
* fieldType.setIndexOptions(IndexOptions.DOCS_ONLY);
* Field field = new Field(name, stream, fieldType);
* Document document = new Document();
* document.add(field);
*
* for(all documents) {
* stream.setIntValue(value);
* writer.addDocument(document);
* }
* </pre>
*
* <p>This stream is not intended to be used in analyzers;
* it's more for iterating the different precisions during
* indexing a specific numeric value.</p>
* <p><b>NOTE</b>: as token streams are only consumed once
* the document is added to the index, if you index more
* than one numeric field, use a separate <code>NumericTokenStream</code>
* instance for each.</p>
*
* <p>See {@link NumericRangeQuery} for more details on the
* <a
* href="../search/NumericRangeQuery.html#precisionStepDesc"><code>precisionStep</code></a>
* parameter as well as how numeric fields work under the hood.</p>
*
* @since 2.9
*/
public final class NumericTokenStream extends TokenStream {
/** The full precision token gets this token type assigned. */
public static final String TOKEN_TYPE_FULL_PREC = "fullPrecNumeric";
/** The lower precision tokens get this token type assigned. */
public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecNumeric";
/** <b>Expert:</b> Use this attribute to get the details of the currently generated token.
* @lucene.experimental
* @since 4.0
*/
public interface NumericTermAttribute extends Attribute {
/** Returns current shift value, undefined before first token */
int getShift();
/** Returns current token's raw value as {@code long} with all {@link #getShift} applied, undefined before first token */
long getRawValue();
/** Returns value size in bits (32 for {@code float}, {@code int}; 64 for {@code double}, {@code long}) */
int getValueSize();
/** <em>Don't call this method!</em>
* @lucene.internal */
void init(long value, int valSize, int precisionStep, int shift);
/** <em>Don't call this method!</em>
* @lucene.internal */
void setShift(int shift);
/** <em>Don't call this method!</em>
* @lucene.internal */
int incShift();
}
// just a wrapper to prevent adding CTA
private static final class NumericAttributeFactory extends AttributeFactory {
private final AttributeFactory delegate;
NumericAttributeFactory(AttributeFactory delegate) {
this.delegate = delegate;
}
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
if (CharTermAttribute.class.isAssignableFrom(attClass))
throw new IllegalArgumentException("NumericTokenStream does not support CharTermAttribute.");
return delegate.createAttributeInstance(attClass);
}
}
/** Implementation of {@link NumericTermAttribute}.
* @lucene.internal
* @since 4.0
*/
public static final class NumericTermAttributeImpl extends AttributeImpl implements NumericTermAttribute,TermToBytesRefAttribute {
private long value = 0L;
private int valueSize = 0, shift = 0, precisionStep = 0;
private BytesRef bytes = new BytesRef();
/**
* Creates, but does not yet initialize this attribute instance
* @see #init(long, int, int, int)
*/
public NumericTermAttributeImpl() {}
public BytesRef getBytesRef() {
return bytes;
}
public int fillBytesRef() {
try {
assert valueSize == 64 || valueSize == 32;
return (valueSize == 64) ?
NumericUtils.longToPrefixCoded(value, shift, bytes) :
NumericUtils.intToPrefixCoded((int) value, shift, bytes);
} catch (IllegalArgumentException iae) {
// return empty token before first or after last
bytes.length = 0;
return 0;
}
}
public int getShift() { return shift; }
public void setShift(int shift) { this.shift = shift; }
public int incShift() {
return (shift += precisionStep);
}
public long getRawValue() { return value & ~((1L << shift) - 1L); }
public int getValueSize() { return valueSize; }
public void init(long value, int valueSize, int precisionStep, int shift) {
this.value = value;
this.valueSize = valueSize;
this.precisionStep = precisionStep;
this.shift = shift;
}
@Override
public void clear() {
// this attribute has no contents to clear!
// we keep it untouched as it's fully controlled by outer class.
}
@Override
public void reflectWith(AttributeReflector reflector) {
fillBytesRef();
reflector.reflect(TermToBytesRefAttribute.class, "bytes", BytesRef.deepCopyOf(bytes));
reflector.reflect(NumericTermAttribute.class, "shift", shift);
reflector.reflect(NumericTermAttribute.class, "rawValue", getRawValue());
reflector.reflect(NumericTermAttribute.class, "valueSize", valueSize);
}
@Override
public void copyTo(AttributeImpl target) {
final NumericTermAttribute a = (NumericTermAttribute) target;
a.init(value, valueSize, precisionStep, shift);
}
}
/**
* Creates a token stream for numeric values using the default <code>precisionStep</code>
* {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The stream is not yet initialized;
* before using it, set a value with one of the set<em>???</em>Value() methods.
*/
public NumericTokenStream() {
this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, NumericUtils.PRECISION_STEP_DEFAULT);
}
/**
* Creates a token stream for numeric values with the specified
* <code>precisionStep</code>. The stream is not yet initialized;
* before using it, set a value with one of the set<em>???</em>Value() methods.
*/
public NumericTokenStream(final int precisionStep) {
this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, precisionStep);
}
/**
* Expert: Creates a token stream for numeric values with the specified
* <code>precisionStep</code> using the given
* {@link AttributeFactory}.
* The stream is not yet initialized;
* before using it, set a value with one of the set<em>???</em>Value() methods.
*/
public NumericTokenStream(AttributeFactory factory, final int precisionStep) {
super(new NumericAttributeFactory(factory));
if (precisionStep < 1)
throw new IllegalArgumentException("precisionStep must be >=1");
this.precisionStep = precisionStep;
numericAtt.setShift(-precisionStep);
}
/**
* Initializes the token stream with the supplied <code>long</code> value.
* @param value the value, for which this TokenStream should enumerate tokens.
* @return this instance, because of this you can use it the following way:
* <code>new Field(name, new NumericTokenStream(precisionStep).setLongValue(value))</code>
*/
public NumericTokenStream setLongValue(final long value) {
numericAtt.init(value, valSize = 64, precisionStep, -precisionStep);
return this;
}
/**
* Initializes the token stream with the supplied <code>int</code> value.
* @param value the value, for which this TokenStream should enumerate tokens.
* @return this instance, because of this you can use it the following way:
* <code>new Field(name, new NumericTokenStream(precisionStep).setIntValue(value))</code>
*/
public NumericTokenStream setIntValue(final int value) {
numericAtt.init(value, valSize = 32, precisionStep, -precisionStep);
return this;
}
/**
* Initializes the token stream with the supplied <code>double</code> value.
* @param value the value, for which this TokenStream should enumerate tokens.
* @return this instance, because of this you can use it the following way:
* <code>new Field(name, new NumericTokenStream(precisionStep).setDoubleValue(value))</code>
*/
public NumericTokenStream setDoubleValue(final double value) {
numericAtt.init(NumericUtils.doubleToSortableLong(value), valSize = 64, precisionStep, -precisionStep);
return this;
}
/**
* Initializes the token stream with the supplied <code>float</code> value.
* @param value the value, for which this TokenStream should enumerate tokens.
* @return this instance, because of this you can use it the following way:
* <code>new Field(name, new NumericTokenStream(precisionStep).setFloatValue(value))</code>
*/
public NumericTokenStream setFloatValue(final float value) {
numericAtt.init(NumericUtils.floatToSortableInt(value), valSize = 32, precisionStep, -precisionStep);
return this;
}
@Override
public void reset() {
if (valSize == 0)
throw new IllegalStateException("call set???Value() before usage");
numericAtt.setShift(-precisionStep);
}
@Override
public boolean incrementToken() {
if (valSize == 0)
throw new IllegalStateException("call set???Value() before usage");
// this will only clear all other attributes in this TokenStream
clearAttributes();
final int shift = numericAtt.incShift();
typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC);
posIncrAtt.setPositionIncrement((shift == 0) ? 1 : 0);
return (shift < valSize);
}
/** Returns the precision step. */
public int getPrecisionStep() {
return precisionStep;
}
// members
private final NumericTermAttribute numericAtt = addAttribute(NumericTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private int valSize = 0; // valSize==0 means not initialized
private final int precisionStep;
}
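
To see the trie tokens this stream emits, a small sketch (the value is arbitrary): with precisionStep = 4 and a 64-bit value, incrementToken() above advances the shift through 0, 4, 8, ..., 60, so exactly 16 tokens come out, the first typed fullPrecNumeric and the rest lowerPrecNumeric:

    NumericTokenStream stream = new NumericTokenStream(4).setLongValue(1234L);
    NumericTokenStream.NumericTermAttribute numeric =
        stream.getAttribute(NumericTokenStream.NumericTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      // rawValue masks off the bits below the current shift
      System.out.println("shift=" + numeric.getShift()
          + " rawValue=" + numeric.getRawValue());
    }
    stream.end();
    stream.close();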

651
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/Token.java

@@ -0,0 +1,651 @@
package com.fr.third.org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.fr.third.org.apache.lucene.index.DocsAndPositionsEnum; // for javadoc
import com.fr.third.org.apache.lucene.util.Attribute;
import com.fr.third.org.apache.lucene.util.AttributeSource;
import com.fr.third.org.apache.lucene.util.AttributeImpl;
import com.fr.third.org.apache.lucene.util.AttributeReflector;
import com.fr.third.org.apache.lucene.util.BytesRef;
/**
A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
<p>
The start and end offsets permit applications to re-associate a token with
its source text, e.g., to display highlighted query terms in a document
browser, or to show matching text fragments in a <abbr title="KeyWord In Context">KWIC</abbr>
display, etc.
<p>
The type is a string, assigned by a lexical analyzer
(a.k.a. tokenizer), naming the lexical or syntactic class that the token
belongs to. For example an end of sentence marker token might be implemented
with type "eos". The default token type is "word".
<p>
A Token can optionally have metadata (a.k.a. payload) in the form of a variable
length byte array. Use {@link DocsAndPositionsEnum#getPayload()} to retrieve the
payloads from the index.
<br><br>
<p><b>NOTE:</b> As of 2.9, Token implements all {@link Attribute} interfaces
that are part of core Lucene and can be found in the {@code tokenattributes} subpackage.
Even though it is not necessary to use Token anymore, with the new TokenStream API it can
be used as convenience class that implements all {@link Attribute}s, which is especially useful
to easily switch from the old to the new TokenStream API.
<br><br>
<p>Tokenizers and TokenFilters should try to re-use a Token
instance when possible for best performance, by
implementing the {@link TokenStream#incrementToken()} API.
Failing that, to create a new Token you should first use
one of the constructors that starts with null text. To load
the token from a char[] use {@link #copyBuffer(char[], int, int)}.
To load from a String use {@link #setEmpty} followed by {@link #append(CharSequence)} or {@link #append(CharSequence, int, int)}.
Alternatively you can get the Token's termBuffer by calling either {@link #buffer()},
if you know that your text is shorter than the capacity of the termBuffer
or {@link #resizeBuffer(int)}, if there is any possibility
that you may need to grow the buffer. Fill in the characters of your term into this
buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setLength(int)} to
set the length of the term text. See <a target="_top"
href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
for details.</p>
<p>Typical Token reuse patterns:
<ul>
<li> Copying text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
<pre class="prettyprint">
return reusableToken.reinit(string, startOffset, endOffset[, type]);
</pre>
</li>
<li> Copying some text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
<pre class="prettyprint">
return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]);
</pre>
</li>
<li> Copying text from char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
<pre class="prettyprint">
return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
</pre>
</li>
<li> Copying some text from a char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
<pre class="prettyprint">
return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]);
</pre>
</li>
<li> Copying from one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/>
<pre class="prettyprint">
return reusableToken.reinit(source.buffer(), 0, source.length(), source.startOffset(), source.endOffset()[, source.type()]);
</pre>
</li>
</ul>
A few things to note:
<ul>
<li>clear() initializes all of the fields to default values. This was changed in contrast to Lucene 2.4, but should affect no one.</li>
<li>Because <code>TokenStreams</code> can be chained, one cannot assume that the <code>Token's</code> current type is correct.</li>
<li>The startOffset and endOffset represent the start and end offset in the source text, so be careful in adjusting them.</li>
<li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li>
</ul>
</p>
<p>
<b>Please note:</b> With Lucene 3.1, the <code>{@linkplain #toString toString()}</code> method had to be changed to match the
{@link CharSequence} interface introduced by the interface {@link com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute}.
This method now only prints the term text, no additional information anymore.
</p>
*/
public class Token extends CharTermAttributeImpl
implements TypeAttribute, PositionIncrementAttribute,
FlagsAttribute, OffsetAttribute, PayloadAttribute, PositionLengthAttribute {
private int startOffset,endOffset;
private String type = DEFAULT_TYPE;
private int flags;
private BytesRef payload;
private int positionIncrement = 1;
private int positionLength = 1;
/** Constructs a Token with null text. */
public Token() {
}
/** Constructs a Token with null text and start & end
* offsets.
* @param start start offset in the source text
* @param end end offset in the source text */
public Token(int start, int end) {
checkOffsets(start, end);
startOffset = start;
endOffset = end;
}
/** Constructs a Token with null text and start & end
* offsets plus the Token type.
* @param start start offset in the source text
* @param end end offset in the source text
* @param typ the lexical type of this Token */
public Token(int start, int end, String typ) {
checkOffsets(start, end);
startOffset = start;
endOffset = end;
type = typ;
}
/**
* Constructs a Token with null text and start & end
* offsets plus flags. NOTE: flags is EXPERIMENTAL.
* @param start start offset in the source text
* @param end end offset in the source text
* @param flags The bits to set for this token
*/
public Token(int start, int end, int flags) {
checkOffsets(start, end);
startOffset = start;
endOffset = end;
this.flags = flags;
}
/** Constructs a Token with the given term text, and start
* & end offsets. The type defaults to "word."
* <b>NOTE:</b> for better indexing speed you should
* instead use the char[] termBuffer methods to set the
* term text.
* @param text term text
* @param start start offset in the source text
* @param end end offset in the source text
*/
public Token(String text, int start, int end) {
checkOffsets(start, end);
append(text);
startOffset = start;
endOffset = end;
}
/** Constructs a Token with the given text, start and end
* offsets, & type. <b>NOTE:</b> for better indexing
* speed you should instead use the char[] termBuffer
* methods to set the term text.
* @param text term text
* @param start start offset in the source text
* @param end end offset in the source text
* @param typ token type
*/
public Token(String text, int start, int end, String typ) {
checkOffsets(start, end);
append(text);
startOffset = start;
endOffset = end;
type = typ;
}
/**
* Constructs a Token with the given text, start and end
* offsets, & type. <b>NOTE:</b> for better indexing
* speed you should instead use the char[] termBuffer
* methods to set the term text.
* @param text term text
* @param start start offset in the source text
* @param end end offset in the source text
* @param flags token type bits
*/
public Token(String text, int start, int end, int flags) {
checkOffsets(start, end);
append(text);
startOffset = start;
endOffset = end;
this.flags = flags;
}
/**
* Constructs a Token with the given term buffer (offset
* & length), start and end
* offsets
* @param startTermBuffer buffer containing term text
* @param termBufferOffset the index in the buffer of the first character
* @param termBufferLength number of valid characters in the buffer
* @param start start offset in the source text
* @param end end offset in the source text
*/
public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) {
checkOffsets(start, end);
copyBuffer(startTermBuffer, termBufferOffset, termBufferLength);
startOffset = start;
endOffset = end;
}
/**
* {@inheritDoc}
* @see PositionIncrementAttribute
*/
public void setPositionIncrement(int positionIncrement) {
if (positionIncrement < 0)
throw new IllegalArgumentException
("Increment must be zero or greater: " + positionIncrement);
this.positionIncrement = positionIncrement;
}
/**
* {@inheritDoc}
* @see PositionIncrementAttribute
*/
public int getPositionIncrement() {
return positionIncrement;
}
/**
* {@inheritDoc}
* @see PositionLengthAttribute
*/
@Override
public void setPositionLength(int positionLength) {
this.positionLength = positionLength;
}
/**
* {@inheritDoc}
* @see PositionLengthAttribute
*/
@Override
public int getPositionLength() {
return positionLength;
}
/**
* {@inheritDoc}
* @see OffsetAttribute
*/
public final int startOffset() {
return startOffset;
}
/**
* {@inheritDoc}
* @see OffsetAttribute
*/
public final int endOffset() {
return endOffset;
}
/**
* {@inheritDoc}
* @see OffsetAttribute
*/
public void setOffset(int startOffset, int endOffset) {
checkOffsets(startOffset, endOffset);
this.startOffset = startOffset;
this.endOffset = endOffset;
}
/**
* {@inheritDoc}
* @see TypeAttribute
*/
public final String type() {
return type;
}
/**
* {@inheritDoc}
* @see TypeAttribute
*/
public final void setType(String type) {
this.type = type;
}
/**
* {@inheritDoc}
* @see FlagsAttribute
*/
public int getFlags() {
return flags;
}
/**
* {@inheritDoc}
* @see FlagsAttribute
*/
public void setFlags(int flags) {
this.flags = flags;
}
/**
* {@inheritDoc}
* @see PayloadAttribute
*/
public BytesRef getPayload() {
return this.payload;
}
/**
* {@inheritDoc}
* @see PayloadAttribute
*/
public void setPayload(BytesRef payload) {
this.payload = payload;
}
/** Resets the term text, payload, flags, and positionIncrement,
* startOffset, endOffset and token type to default.
*/
@Override
public void clear() {
super.clear();
payload = null;
positionIncrement = 1;
flags = 0;
startOffset = endOffset = 0;
type = DEFAULT_TYPE;
}
@Override
public Token clone() {
Token t = (Token)super.clone();
// Do a deep clone
if (payload != null) {
t.payload = payload.clone();
}
return t;
}
/** Makes a clone, but replaces the term buffer &
* start/end offset in the process. This is more
* efficient than doing a full clone (and then calling
* {@link #copyBuffer}) because it saves a wasted copy of the old
* termBuffer. */
public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset);
t.positionIncrement = positionIncrement;
t.flags = flags;
t.type = type;
if (payload != null)
t.payload = payload.clone();
return t;
}
@Override
public boolean equals(Object obj) {
if (obj == this)
return true;
if (obj instanceof Token) {
final Token other = (Token) obj;
return (startOffset == other.startOffset &&
endOffset == other.endOffset &&
flags == other.flags &&
positionIncrement == other.positionIncrement &&
(type == null ? other.type == null : type.equals(other.type)) &&
(payload == null ? other.payload == null : payload.equals(other.payload)) &&
super.equals(obj)
);
} else
return false;
}
@Override
public int hashCode() {
int code = super.hashCode();
code = code * 31 + startOffset;
code = code * 31 + endOffset;
code = code * 31 + flags;
code = code * 31 + positionIncrement;
if (type != null)
code = code * 31 + type.hashCode();
if (payload != null)
code = code * 31 + payload.hashCode();
return code;
}
// like clear() but doesn't clear termBuffer/text
private void clearNoTermBuffer() {
payload = null;
positionIncrement = 1;
flags = 0;
startOffset = endOffset = 0;
type = DEFAULT_TYPE;
}
/** Shorthand for calling {@link #clear},
* {@link #copyBuffer(char[], int, int)},
* {@link #setOffset},
* {@link #setType}
* @return this Token instance */
public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
checkOffsets(newStartOffset, newEndOffset);
clearNoTermBuffer();
copyBuffer(newTermBuffer, newTermOffset, newTermLength);
payload = null;
positionIncrement = 1;
startOffset = newStartOffset;
endOffset = newEndOffset;
type = newType;
return this;
}
/** Shorthand for calling {@link #clear},
* {@link #copyBuffer(char[], int, int)},
* {@link #setOffset},
* {@link #setType} on Token.DEFAULT_TYPE
* @return this Token instance */
public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
checkOffsets(newStartOffset, newEndOffset);
clearNoTermBuffer();
copyBuffer(newTermBuffer, newTermOffset, newTermLength);
startOffset = newStartOffset;
endOffset = newEndOffset;
type = DEFAULT_TYPE;
return this;
}
/** Shorthand for calling {@link #clear},
* {@link #append(CharSequence)},
* {@link #setOffset},
* {@link #setType}
* @return this Token instance */
public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) {
checkOffsets(newStartOffset, newEndOffset);
clear();
append(newTerm);
startOffset = newStartOffset;
endOffset = newEndOffset;
type = newType;
return this;
}
/** Shorthand for calling {@link #clear},
* {@link #append(CharSequence, int, int)},
* {@link #setOffset},
* {@link #setType}
* @return this Token instance */
public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) {
checkOffsets(newStartOffset, newEndOffset);
clear();
append(newTerm, newTermOffset, newTermOffset + newTermLength);
startOffset = newStartOffset;
endOffset = newEndOffset;
type = newType;
return this;
}
/** Shorthand for calling {@link #clear},
* {@link #append(CharSequence)},
* {@link #setOffset},
* {@link #setType} on Token.DEFAULT_TYPE
* @return this Token instance */
public Token reinit(String newTerm, int newStartOffset, int newEndOffset) {
checkOffsets(newStartOffset, newEndOffset);
clear();
append(newTerm);
startOffset = newStartOffset;
endOffset = newEndOffset;
type = DEFAULT_TYPE;
return this;
}
/** Shorthand for calling {@link #clear},
* {@link #append(CharSequence, int, int)},
* {@link #setOffset},
* {@link #setType} on Token.DEFAULT_TYPE
* @return this Token instance */
public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
checkOffsets(newStartOffset, newEndOffset);
clear();
append(newTerm, newTermOffset, newTermOffset + newTermLength);
startOffset = newStartOffset;
endOffset = newEndOffset;
type = DEFAULT_TYPE;
return this;
}
/**
* Copy the prototype token's fields into this one. Note: Payloads are shared.
* @param prototype source Token to copy fields from
*/
public void reinit(Token prototype) {
copyBuffer(prototype.buffer(), 0, prototype.length());
positionIncrement = prototype.positionIncrement;
flags = prototype.flags;
startOffset = prototype.startOffset;
endOffset = prototype.endOffset;
type = prototype.type;
payload = prototype.payload;
}
/**
* Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
* @param prototype existing Token
* @param newTerm new term text
*/
public void reinit(Token prototype, String newTerm) {
setEmpty().append(newTerm);
positionIncrement = prototype.positionIncrement;
flags = prototype.flags;
startOffset = prototype.startOffset;
endOffset = prototype.endOffset;
type = prototype.type;
payload = prototype.payload;
}
/**
* Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
* @param prototype existing Token
* @param newTermBuffer buffer containing new term text
* @param offset the index in the buffer of the first character
* @param length number of valid characters in the buffer
*/
public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) {
copyBuffer(newTermBuffer, offset, length);
positionIncrement = prototype.positionIncrement;
flags = prototype.flags;
startOffset = prototype.startOffset;
endOffset = prototype.endOffset;
type = prototype.type;
payload = prototype.payload;
}
@Override
public void copyTo(AttributeImpl target) {
if (target instanceof Token) {
final Token to = (Token) target;
to.reinit(this);
// reinit shares the payload, so clone it:
if (payload != null) {
to.payload = payload.clone();
}
} else {
super.copyTo(target);
((OffsetAttribute) target).setOffset(startOffset, endOffset);
((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
((PayloadAttribute) target).setPayload((payload == null) ? null : payload.clone());
((FlagsAttribute) target).setFlags(flags);
((TypeAttribute) target).setType(type);
}
}
@Override
public void reflectWith(AttributeReflector reflector) {
super.reflectWith(reflector);
reflector.reflect(OffsetAttribute.class, "startOffset", startOffset);
reflector.reflect(OffsetAttribute.class, "endOffset", endOffset);
reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement);
reflector.reflect(PayloadAttribute.class, "payload", payload);
reflector.reflect(FlagsAttribute.class, "flags", flags);
reflector.reflect(TypeAttribute.class, "type", type);
}
private void checkOffsets(int startOffset, int endOffset) {
if (startOffset < 0 || endOffset < startOffset) {
throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
+ "startOffset=" + startOffset + ",endOffset=" + endOffset);
}
}
/** Convenience factory that returns <code>Token</code> as the implementation for the basic
* attributes and returns the default impl (with &quot;Impl&quot; appended) for all other
* attributes.
* @since 3.0
*/
public static final AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY =
new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
/** <b>Expert:</b> Creates a TokenAttributeFactory returning {@link Token} as instance for the basic attributes
* and for all other attributes calls the given delegate factory.
* @since 3.0
*/
public static final class TokenAttributeFactory extends AttributeSource.AttributeFactory {
private final AttributeSource.AttributeFactory delegate;
/** <b>Expert</b>: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes
* and for all other attributes calls the given delegate factory. */
public TokenAttributeFactory(AttributeSource.AttributeFactory delegate) {
this.delegate = delegate;
}
@Override
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
return attClass.isAssignableFrom(Token.class)
? new Token() : delegate.createAttributeInstance(attClass);
}
@Override
public boolean equals(Object other) {
if (this == other) return true;
if (other instanceof TokenAttributeFactory) {
final TokenAttributeFactory af = (TokenAttributeFactory) other;
return this.delegate.equals(af.delegate);
}
return false;
}
@Override
public int hashCode() {
return delegate.hashCode() ^ 0x0a45aa31;
}
}
}
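As a usage note on the reinit overloads above: they let a producer reuse one Token instance across terms instead of allocating per token. A minimal sketch (the holder class, buffer contents, and the "word" type are illustrative, not part of this commit):

import com.fr.third.org.apache.lucene.analysis.Token;

public class TokenReuseExample {
    public static void main(String[] args) {
        char[] buffer = "quick brown".toCharArray();
        Token token = new Token();
        // term "quick", offsets [0,5), explicit type
        token.reinit(buffer, 0, 5, 0, 5, "word");
        // same instance reused: term "brown", offsets [6,11), DEFAULT_TYPE
        token.reinit(buffer, 6, 5, 6, 11);
        System.out.println(token + " [" + token.startOffset() + "," + token.endOffset() + ")");
    }
}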

72
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/TokenFilter.java

@@ -0,0 +1,72 @@
package com.fr.third.org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
/** A TokenFilter is a TokenStream whose input is another TokenStream.
<p>
This is an abstract class; subclasses must override {@link #incrementToken()}.
@see TokenStream
*/
public abstract class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */
protected final TokenStream input;
/** Construct a token stream filtering the given input. */
protected TokenFilter(TokenStream input) {
super(input);
this.input = input;
}
/**
* {@inheritDoc}
* <p>
* <b>NOTE:</b>
* The default implementation chains the call to the input TokenStream, so
* be sure to call <code>super.end()</code> first when overriding this method.
*/
@Override
public void end() throws IOException {
input.end();
}
/**
* {@inheritDoc}
* <p>
* <b>NOTE:</b>
* The default implementation chains the call to the input TokenStream, so
* be sure to call <code>super.close()</code> when overriding this method.
*/
@Override
public void close() throws IOException {
input.close();
}
/**
* {@inheritDoc}
* <p>
* <b>NOTE:</b>
* The default implementation chains the call to the input TokenStream, so
* be sure to call <code>super.reset()</code> when overriding this method.
*/
@Override
public void reset() throws IOException {
input.reset();
}
}
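To make the chaining contract above concrete, here is a minimal sketch of a filter (the class name and behavior are illustrative, not part of this commit). It is declared final, so the assertion in TokenStream passes, and it inherits the default end()/close()/reset() implementations, which already chain to the input:

import java.io.IOException;
import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class UpperCaseExampleFilter extends TokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    public UpperCaseExampleFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false; // end of stream: nothing left to transform
        }
        final char[] buf = termAtt.buffer();
        for (int i = 0; i < termAtt.length(); i++) {
            buf[i] = Character.toUpperCase(buf[i]); // rewrite the term in place
        }
        return true;
    }
}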

181
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/TokenStream.java

@@ -0,0 +1,181 @@
package com.fr.third.org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Closeable;
import java.lang.reflect.Modifier;
import com.fr.third.org.apache.lucene.document.Document;
import com.fr.third.org.apache.lucene.document.Field;
import com.fr.third.org.apache.lucene.index.IndexWriter;
import com.fr.third.org.apache.lucene.util.Attribute;
import com.fr.third.org.apache.lucene.util.AttributeImpl;
import com.fr.third.org.apache.lucene.util.AttributeSource;
/**
* A <code>TokenStream</code> enumerates the sequence of tokens, either from
* {@link Field}s of a {@link Document} or from query text.
* <p>
* This is an abstract class; concrete subclasses are:
* <ul>
* <li>{@link Tokenizer}, a <code>TokenStream</code> whose input is a Reader; and
* <li>{@link TokenFilter}, a <code>TokenStream</code> whose input is another
* <code>TokenStream</code>.
* </ul>
* A new <code>TokenStream</code> API has been introduced with Lucene 2.9. This API
* has moved from being {@link Token}-based to {@link Attribute}-based. While
* {@link Token} still exists in 2.9 as a convenience class, the preferred way
* to store the information of a {@link Token} is to use {@link AttributeImpl}s.
* <p>
* <code>TokenStream</code> now extends {@link AttributeSource}, which provides
* access to all of the token {@link Attribute}s for the <code>TokenStream</code>.
* Note that only one instance per {@link AttributeImpl} is created and reused
* for every token. This approach reduces object creation and allows local
* caching of references to the {@link AttributeImpl}s. See
* {@link #incrementToken()} for further details.
* <p>
* <b>The workflow of the new <code>TokenStream</code> API is as follows:</b>
* <ol>
* <li>Instantiation of <code>TokenStream</code>/{@link TokenFilter}s which add/get
* attributes to/from the {@link AttributeSource}.
* <li>The consumer calls {@link TokenStream#reset()}.
* <li>The consumer retrieves attributes from the stream and stores local
* references to all attributes it wants to access.
* <li>The consumer calls {@link #incrementToken()} until it returns false
* consuming the attributes after each call.
* <li>The consumer calls {@link #end()} so that any end-of-stream operations
* can be performed.
* <li>The consumer calls {@link #close()} to release any resource when finished
* using the <code>TokenStream</code>.
* </ol>
* To make sure that filters and consumers know which attributes are available,
* the attributes must be added during instantiation. Filters and consumers are
* not required to check for availability of attributes in
* {@link #incrementToken()}.
* <p>
* You can find some example code for the new API in the analysis package level
* Javadoc.
* <p>
* Sometimes it is desirable to capture a current state of a <code>TokenStream</code>,
* e.g., for buffering purposes (see {@link CachingTokenFilter},
* TeeSinkTokenFilter). For this use case
* {@link AttributeSource#captureState} and {@link AttributeSource#restoreState}
* can be used.
* <p>The {@code TokenStream}-API in Lucene is based on the decorator pattern.
* Therefore all non-abstract subclasses must be final or have at least a final
* implementation of {@link #incrementToken}! This is checked when Java
* assertions are enabled.
*/
public abstract class TokenStream extends AttributeSource implements Closeable {
/**
* A TokenStream using the default attribute factory.
*/
protected TokenStream() {
super();
assert assertFinal();
}
/**
* A TokenStream that uses the same attributes as the supplied one.
*/
protected TokenStream(AttributeSource input) {
super(input);
assert assertFinal();
}
/**
* A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances.
*/
protected TokenStream(AttributeFactory factory) {
super(factory);
assert assertFinal();
}
private boolean assertFinal() {
try {
final Class<?> clazz = getClass();
if (!clazz.desiredAssertionStatus())
return true;
assert clazz.isAnonymousClass() ||
(clazz.getModifiers() & (Modifier.FINAL | Modifier.PRIVATE)) != 0 ||
Modifier.isFinal(clazz.getMethod("incrementToken").getModifiers()) :
"TokenStream implementation classes or at least their incrementToken() implementation must be final";
return true;
} catch (NoSuchMethodException nsme) {
return false;
}
}
/**
* Consumers (e.g., {@link IndexWriter}) use this method to advance the stream to
* the next token. Implementing classes must implement this method and update
* the appropriate {@link AttributeImpl}s with the attributes of the next
* token.
* <p>
* The producer must make no assumptions about the attributes after the method
* has returned: the caller may arbitrarily change them. If the producer
* needs to preserve the state for subsequent calls, it can use
* {@link #captureState} to create a copy of the current attribute state.
* <p>
* This method is called for every token of a document, so an efficient
* implementation is crucial for good performance. To avoid calls to
* {@link #addAttribute(Class)} and {@link #getAttribute(Class)},
* references to all {@link AttributeImpl}s that this stream uses should be
* retrieved during instantiation.
* <p>
* To ensure that filters and consumers know which attributes are available,
* the attributes must be added during instantiation. Filters and consumers
* are not required to check for availability of attributes in
* {@link #incrementToken()}.
*
* @return false for end of stream; true otherwise
*/
public abstract boolean incrementToken() throws IOException;
/**
* This method is called by the consumer after the last token has been
* consumed, after {@link #incrementToken()} returned <code>false</code>
* (using the new <code>TokenStream</code> API). Streams implementing the old API
* should upgrade to use this feature.
* <p/>
* This method can be used to perform any end-of-stream operations, such as
* setting the final offset of a stream. The final offset of a stream might
* differ from the offset of the last token, e.g. when one or more whitespace
* characters followed the last token and a WhitespaceTokenizer was used.
*
* @throws IOException If an I/O error occurs
*/
public void end() throws IOException {
// do nothing by default
}
/**
* This method is called by a consumer before it begins consumption using
* {@link #incrementToken()}.
* <p/>
* Resets this stream to a clean state. Stateful implementations must implement
* this method so that they can be reused, just as if they had been created fresh.
*/
public void reset() throws IOException {}
/** Releases resources associated with this stream. */
public void close() throws IOException {}
}
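The numbered workflow in the class javadoc maps directly onto consumer code. A minimal sketch, assuming only the classes in this commit (the holder class and method name are illustrative):

import java.io.IOException;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ConsumerExample {
    public static void consume(TokenStream stream) throws IOException {
        // step 3: retrieve and keep a local reference to each attribute
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                   // step 2: reset before consuming
        while (stream.incrementToken()) { // step 4: advance until exhausted
            System.out.println(term.toString());
        }
        stream.end();                     // step 5: end-of-stream operations
        stream.close();                   // step 6: release resources
    }
}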

99
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/Tokenizer.java

@@ -0,0 +1,99 @@
package com.fr.third.org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.util.AttributeSource;
import java.io.Reader;
import java.io.IOException;
/** A Tokenizer is a TokenStream whose input is a Reader.
<p>
This is an abstract class; subclasses must override {@link #incrementToken()}.
<p>
NOTE: Subclasses overriding {@link #incrementToken()} must
call {@link AttributeSource#clearAttributes()} before
setting attributes.
*/
public abstract class Tokenizer extends TokenStream {
/** The text source for this Tokenizer. */
protected Reader input;
/** Construct a token stream processing the given input. */
protected Tokenizer(Reader input) {
assert input != null: "input must not be null";
this.input = input;
}
/** Construct a token stream processing the given input using the given AttributeFactory. */
protected Tokenizer(AttributeFactory factory, Reader input) {
super(factory);
assert input != null: "input must not be null";
this.input = input;
}
/** Construct a token stream processing the given input using the given AttributeSource. */
protected Tokenizer(AttributeSource source, Reader input) {
super(source);
assert input != null: "input must not be null";
this.input = input;
}
/**
* {@inheritDoc}
* <p>
* <b>NOTE:</b>
* The default implementation closes the input Reader, so
* be sure to call <code>super.close()</code> when overriding this method.
*/
@Override
public void close() throws IOException {
if (input != null) {
input.close();
// LUCENE-2387: don't hold onto Reader after close, so
// GC can reclaim
input = null;
}
}
/** Return the corrected offset. If {@link #input} is a {@link CharFilter} subclass
* this method calls {@link CharFilter#correctOffset}, else returns <code>currentOff</code>.
* @param currentOff offset as seen in the output
* @return corrected offset based on the input
* @see CharFilter#correctOffset
*/
protected final int correctOffset(int currentOff) {
assert input != null: "this tokenizer is closed";
return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(currentOff) : currentOff;
}
/** Expert: Set a new reader on the Tokenizer. Typically, an
* analyzer (in its tokenStream method) will use
* this to re-use a previously created tokenizer. */
public final void setReader(Reader input) throws IOException {
assert input != null: "input must not be null";
this.input = input;
assert setReaderTestPoint();
}
// only used by assert, for testing
boolean setReaderTestPoint() {
return true;
}
}
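A minimal sketch of a Tokenizer subclass honoring the contract above (the class is hypothetical and deliberately simplistic): it calls clearAttributes() before setting attributes, as the NOTE requires, reports offsets through correctOffset, and resets its own state in reset():

import java.io.IOException;
import java.io.Reader;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public final class LetterRunTokenizer extends Tokenizer {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private int offset = 0; // position of the next character to read

    public LetterRunTokenizer(Reader in) {
        super(in);
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes(); // required before setting attributes (see NOTE above)
        int c = input.read();
        while (c != -1 && !Character.isLetter(c)) { // skip separators
            offset++;
            c = input.read();
        }
        if (c == -1) {
            return false; // end of stream
        }
        final int start = offset;
        do { // accumulate one run of letters
            termAtt.append((char) c);
            offset++;
            c = input.read();
        } while (c != -1 && Character.isLetter(c));
        if (c != -1) {
            offset++; // account for the separator that ended the run
        }
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
        return true;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        offset = 0;
    }
}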

153
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicAnalyzer.java

@@ -0,0 +1,153 @@
package com.fr.third.org.apache.lucene.analysis.ar;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import com.fr.third.org.apache.lucene.analysis.Analyzer;
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter;
import com.fr.third.org.apache.lucene.analysis.core.StopFilter;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Arabic.
* <p>
* This analyzer implements light-stemming as specified by:
* <i>
* Light Stemming for Arabic Information Retrieval
* </i>
* http://www.mtholyoke.edu/~lballest/Pubs/arab_stem05.pdf
* <p>
* The analysis package contains three primary components:
* <ul>
* <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization.
* <li>{@link ArabicStemFilter}: Arabic light stemming
* <li>Arabic stop words file: a set of default Arabic stop words.
* </ul>
*
*/
public final class ArabicAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default Arabic stopwords.
*
* Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
* The stopword list is BSD-Licensed.
*/
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
/**
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
* accesses the static final set the first time.
*/
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
static {
try {
DEFAULT_STOP_SET = loadStopwordSet(false, ArabicAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set", ex);
}
}
}
private final CharArraySet stemExclusionSet;
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
public ArabicAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords){
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* {@link ArabicStemFilter}.
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
* @param stemExclusionSet
* a set of terms not to be stemmed
*/
public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
}
/**
* Creates
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StopFilter},
* {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter}
* (if a stem exclusion set is provided), and {@link ArabicStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31) ?
new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
// the order here is important: the stopword list is not normalized!
result = new StopFilter( matchVersion, result, stopwords);
// TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
result = new ArabicNormalizationFilter(result);
if(!stemExclusionSet.isEmpty()) {
result = new KeywordMarkerFilter(result, stemExclusionSet);
}
return new TokenStreamComponents(source, new ArabicStemFilter(result));
}
}
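A usage sketch for the three-argument constructor: terms placed in the stem exclusion set are marked as keywords and skipped by ArabicStemFilter. The holder class and the sample term are illustrative placeholders:

import java.util.Arrays;
import com.fr.third.org.apache.lucene.analysis.Analyzer;
import com.fr.third.org.apache.lucene.analysis.ar.ArabicAnalyzer;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

public class ArabicAnalyzerSetup {
    public static Analyzer build() {
        // placeholder term to protect from stemming (e.g. a proper noun)
        CharArraySet exclusions = new CharArraySet(Version.LUCENE_40,
            Arrays.asList("سامسونج"), false);
        return new ArabicAnalyzer(Version.LUCENE_40,
            ArabicAnalyzer.getDefaultStopSet(), exclusions);
    }
}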

96
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java

@@ -0,0 +1,96 @@
package com.fr.third.org.apache.lucene.analysis.ar;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import com.fr.third.org.apache.lucene.analysis.core.LetterTokenizer;
import com.fr.third.org.apache.lucene.analysis.util.CharTokenizer;
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link
import com.fr.third.org.apache.lucene.util.AttributeSource;
import com.fr.third.org.apache.lucene.util.Version;
/**
* Tokenizer that breaks text into runs of letters and diacritics.
* <p>
* The problem with the standard Letter tokenizer is that it fails on diacritics.
* Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc.
* </p>
* <p>
* <a name="version"/>
* You must specify the required {@link Version} compatibility when creating
* {@link ArabicLetterTokenizer}:
* <ul>
* <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
* detect token characters. See {@link #isTokenChar(int)} and
* {@link #normalize(int)} for details.</li>
* </ul>
* @deprecated (3.1) Use {@link StandardTokenizer} instead.
*/
@Deprecated
public class ArabicLetterTokenizer extends LetterTokenizer {
/**
* Construct a new ArabicLetterTokenizer.
* @param matchVersion Lucene version
* to match. See {@link <a href="#version">above</a>}
*
* @param in
* the input to split up into tokens
*/
public ArabicLetterTokenizer(Version matchVersion, Reader in) {
super(matchVersion, in);
}
/**
* Construct a new ArabicLetterTokenizer using a given {@link AttributeSource}.
*
* @param matchVersion
* Lucene version to match. See {@link <a href="#version">above</a>}
* @param source
* the attribute source to use for this Tokenizer
* @param in
* the input to split up into tokens
*/
public ArabicLetterTokenizer(Version matchVersion, AttributeSource source, Reader in) {
super(matchVersion, source, in);
}
/**
* Construct a new ArabicLetterTokenizer using a given
* {@link AttributeSource.AttributeFactory}.
*
* @param matchVersion
* Lucene version to match. See {@link <a href="#version">above</a>}
*
* @param factory
* the attribute factory to use for this Tokenizer
* @param in
* the input to split up into tokens
*/
public ArabicLetterTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
super(matchVersion, factory, in);
}
/**
* Allows for Letter category or NonspacingMark category
* @see com.fr.third.org.apache.lucene.analysis.core.LetterTokenizer#isTokenChar(int)
*/
@Override
protected boolean isTokenChar(int c) {
return super.isTokenChar(c) || Character.getType(c) == Character.NON_SPACING_MARK;
}
}
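The point of the isTokenChar override can be checked with plain java.lang.Character (illustrative holder class): a harakat is not a letter, so a plain LetterTokenizer would split on it, but it is a nonspacing mark, so this tokenizer keeps it inside the token:

public class MarkCheckExample {
    public static void main(String[] args) {
        int fatha = '\u064E'; // an Arabic diacritic (harakat)
        System.out.println(Character.isLetter(fatha));                              // false
        System.out.println(Character.getType(fatha) == Character.NON_SPACING_MARK); // true
    }
}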

43
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicLetterTokenizerFactory.java

@@ -0,0 +1,43 @@
package com.fr.third.org.apache.lucene.analysis.ar;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
import com.fr.third.org.apache.lucene.analysis.util.TokenizerFactory;
import java.io.Reader;
import java.util.Map;
/**
* Factory for {@link ArabicLetterTokenizer}
* @deprecated (3.1) Use StandardTokenizerFactory instead.
**/
@Deprecated
public class ArabicLetterTokenizerFactory extends TokenizerFactory {
@Override
public void init(Map<String,String> args) {
super.init(args);
assureMatchVersion();
}
public ArabicLetterTokenizer create(Reader input) {
return new ArabicLetterTokenizer(luceneMatchVersion, input);
}
}

48
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java

@@ -0,0 +1,48 @@
package com.fr.third.org.apache.lucene.analysis.ar;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* A {@link TokenFilter} that applies {@link ArabicNormalizer} to normalize the orthography.
*
*/
public final class ArabicNormalizationFilter extends TokenFilter {
private final ArabicNormalizer normalizer = new ArabicNormalizer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public ArabicNormalizationFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
int newlen = normalizer.normalize(termAtt.buffer(), termAtt.length());
termAtt.setLength(newlen);
return true;
}
return false;
}
}

48
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicNormalizationFilterFactory.java

@@ -0,0 +1,48 @@
package com.fr.third.org.apache.lucene.analysis.ar;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import com.fr.third.org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import com.fr.third.org.apache.lucene.analysis.util.MultiTermAwareComponent;
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link ArabicNormalizationFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_arnormal" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.ArabicNormalizationFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
*/
public class ArabicNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
public ArabicNormalizationFilter create(TokenStream input) {
return new ArabicNormalizationFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
}
}

101
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicNormalizer.java

@@ -0,0 +1,101 @@
package com.fr.third.org.apache.lucene.analysis.ar;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static com.fr.third.org.apache.lucene.analysis.util.StemmerUtil.*;
/**
* Normalizer for Arabic.
* <p>
* Normalization is done in-place for efficiency, operating on a termbuffer.
* <p>
* Normalization is defined as:
* <ul>
* <li> Normalization of hamza with alef seat to a bare alef.
* <li> Normalization of teh marbuta to heh
* <li> Normalization of dotless yeh (alef maksura) to yeh.
* <li> Removal of Arabic diacritics (the harakat)
* <li> Removal of tatweel (stretching character).
* </ul>
*
*/
public class ArabicNormalizer {
public static final char ALEF = '\u0627';
public static final char ALEF_MADDA = '\u0622';
public static final char ALEF_HAMZA_ABOVE = '\u0623';
public static final char ALEF_HAMZA_BELOW = '\u0625';
public static final char YEH = '\u064A';
public static final char DOTLESS_YEH = '\u0649';
public static final char TEH_MARBUTA = '\u0629';
public static final char HEH = '\u0647';
public static final char TATWEEL = '\u0640';
public static final char FATHATAN = '\u064B';
public static final char DAMMATAN = '\u064C';
public static final char KASRATAN = '\u064D';
public static final char FATHA = '\u064E';
public static final char DAMMA = '\u064F';
public static final char KASRA = '\u0650';
public static final char SHADDA = '\u0651';
public static final char SUKUN = '\u0652';
/**
* Normalize an input buffer of Arabic text
*
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after normalization
*/
public int normalize(char s[], int len) {
for (int i = 0; i < len; i++) {
switch (s[i]) {
case ALEF_MADDA:
case ALEF_HAMZA_ABOVE:
case ALEF_HAMZA_BELOW:
s[i] = ALEF;
break;
case DOTLESS_YEH:
s[i] = YEH;
break;
case TEH_MARBUTA:
s[i] = HEH;
break;
case TATWEEL:
case KASRATAN:
case DAMMATAN:
case FATHATAN:
case FATHA:
case DAMMA:
case KASRA:
case SHADDA:
case SUKUN:
len = delete(s, i, len);
i--;
break;
default:
break;
}
}
return len;
}
}
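A sketch of the in-place buffer contract (holder class and input are illustrative): the caller keeps the returned length and ignores anything in the buffer beyond it:

import com.fr.third.org.apache.lucene.analysis.ar.ArabicNormalizer;

public class NormalizeExample {
    public static void main(String[] args) {
        ArabicNormalizer normalizer = new ArabicNormalizer();
        // alef-madda + fatha + dotless yeh
        char[] buf = { ArabicNormalizer.ALEF_MADDA, ArabicNormalizer.FATHA,
                       ArabicNormalizer.DOTLESS_YEH };
        int len = normalizer.normalize(buf, buf.length);
        // len == 2: the fatha was deleted in place, the alef-madda folded to
        // bare alef, and the dotless yeh became yeh
        System.out.println(new String(buf, 0, len));
    }
}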

58
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicStemFilter.java

@@ -0,0 +1,58 @@
package com.fr.third.org.apache.lucene.analysis.ar;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // javadoc @link
import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see KeywordMarkerFilter */
public final class ArabicStemFilter extends TokenFilter {
private final ArabicStemmer stemmer = new ArabicStemmer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
public ArabicStemFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if(!keywordAttr.isKeyword()) {
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
termAtt.setLength(newlen);
}
return true;
} else {
return false;
}
}
}

43
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicStemFilterFactory.java

@@ -0,0 +1,43 @@
package com.fr.third.org.apache.lucene.analysis.ar;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.ar.ArabicStemFilter;
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link ArabicStemFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_arstem" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.ArabicNormalizationFilterFactory"/&gt;
* &lt;filter class="solr.ArabicStemFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
*/
public class ArabicStemFilterFactory extends TokenFilterFactory {
public ArabicStemFilter create(TokenStream input) {
return new ArabicStemFilter(input);
}
}

150
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/ArabicStemmer.java

@@ -0,0 +1,150 @@
package com.fr.third.org.apache.lucene.analysis.ar;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static com.fr.third.org.apache.lucene.analysis.util.StemmerUtil.*;
/**
* Stemmer for Arabic.
* <p>
* Stemming is done in-place for efficiency, operating on a termbuffer.
* <p>
* Stemming is defined as:
* <ul>
* <li> Removal of attached definite article, conjunction, and prepositions.
* <li> Stemming of common suffixes.
* </ul>
*
*/
public class ArabicStemmer {
public static final char ALEF = '\u0627';
public static final char BEH = '\u0628';
public static final char TEH_MARBUTA = '\u0629';
public static final char TEH = '\u062A';
public static final char FEH = '\u0641';
public static final char KAF = '\u0643';
public static final char LAM = '\u0644';
public static final char NOON = '\u0646';
public static final char HEH = '\u0647';
public static final char WAW = '\u0648';
public static final char YEH = '\u064A';
public static final char prefixes[][] = {
("" + ALEF + LAM).toCharArray(),
("" + WAW + ALEF + LAM).toCharArray(),
("" + BEH + ALEF + LAM).toCharArray(),
("" + KAF + ALEF + LAM).toCharArray(),
("" + FEH + ALEF + LAM).toCharArray(),
("" + LAM + LAM).toCharArray(),
("" + WAW).toCharArray(),
};
public static final char suffixes[][] = {
("" + HEH + ALEF).toCharArray(),
("" + ALEF + NOON).toCharArray(),
("" + ALEF + TEH).toCharArray(),
("" + WAW + NOON).toCharArray(),
("" + YEH + NOON).toCharArray(),
("" + YEH + HEH).toCharArray(),
("" + YEH + TEH_MARBUTA).toCharArray(),
("" + HEH).toCharArray(),
("" + TEH_MARBUTA).toCharArray(),
("" + YEH).toCharArray(),
};
/**
* Stem an input buffer of Arabic text.
*
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after stemming
*/
public int stem(char s[], int len) {
len = stemPrefix(s, len);
len = stemSuffix(s, len);
return len;
}
/**
* Stem a prefix off an Arabic word.
* @param s input buffer
* @param len length of input buffer
* @return new length of input buffer after stemming.
*/
public int stemPrefix(char s[], int len) {
for (int i = 0; i < prefixes.length; i++)
if (startsWithCheckLength(s, len, prefixes[i]))
return deleteN(s, 0, len, prefixes[i].length);
return len;
}
/**
* Stem suffix(es) off an Arabic word.
* @param s input buffer
* @param len length of input buffer
* @return new length of input buffer after stemming
*/
public int stemSuffix(char s[], int len) {
for (int i = 0; i < suffixes.length; i++)
if (endsWithCheckLength(s, len, suffixes[i]))
len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length);
return len;
}
/**
* Returns true if the prefix matches and can be stemmed
* @param s input buffer
* @param len length of input buffer
* @param prefix prefix to check
* @return true if the prefix matches and can be stemmed
*/
boolean startsWithCheckLength(char s[], int len, char prefix[]) {
if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters
return false;
} else if (len < prefix.length + 2) { // other prefixes require only 2.
return false;
} else {
for (int i = 0; i < prefix.length; i++)
if (s[i] != prefix[i])
return false;
return true;
}
}
/**
* Returns true if the suffix matches and can be stemmed
* @param s input buffer
* @param len length of input buffer
* @param suffix suffix to check
* @return true if the suffix matches and can be stemmed
*/
boolean endsWithCheckLength(char s[], int len, char suffix[]) {
if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming
return false;
} else {
for (int i = 0; i < suffix.length; i++)
if (s[len - suffix.length + i] != suffix[i])
return false;
return true;
}
}
}
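A sketch of the length guards in action (holder class and input are illustrative): the definite article is stripped only when enough characters remain, per startsWithCheckLength:

import com.fr.third.org.apache.lucene.analysis.ar.ArabicStemmer;

public class StemExample {
    public static void main(String[] args) {
        ArabicStemmer stemmer = new ArabicStemmer();
        // ALEF+LAM ("al-", the definite article) followed by a 3-letter root
        char[] buf = { ArabicStemmer.ALEF, ArabicStemmer.LAM,
                       ArabicStemmer.KAF, ArabicStemmer.TEH, ArabicStemmer.BEH };
        int len = stemmer.stem(buf, buf.length);
        // len == 3: the two-char article passed the len >= prefix.length + 2
        // guard and was removed; no suffix matched afterwards
        System.out.println(new String(buf, 0, len));
    }
}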

22
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ar/package.html

@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Arabic.
</body>
</html>

131
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java

@@ -0,0 +1,131 @@
package com.fr.third.org.apache.lucene.analysis.bg;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
import com.fr.third.org.apache.lucene.analysis.Analyzer;
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter;
import com.fr.third.org.apache.lucene.analysis.core.StopFilter;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.standard.StandardFilter;
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import com.fr.third.org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Bulgarian.
* <p>
* This analyzer implements light-stemming as specified by: <i> Searching
* Strategies for the Bulgarian Language </i>
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
* <p>
*/
public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default Bulgarian stopwords.
*
* Default stopword list is from
* http://members.unine.ch/jacques.savoy/clef/index.html The stopword list is
* BSD-Licensed.
*/
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Returns an unmodifiable instance of the default stop-words set.
*
* @return an unmodifiable instance of the default stop-words set.
*/
public static CharArraySet getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET;
}
/**
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer
* class accesses the static final set the first time.
*/
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
static {
try {
DEFAULT_STOP_SET = loadStopwordSet(false, BulgarianAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set", ex);
}
}
}
private final CharArraySet stemExclusionSet;
/**
* Builds an analyzer with the default stop words:
* {@link #DEFAULT_STOPWORD_FILE}.
*/
public BulgarianAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words.
*/
public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
/**
* Builds an analyzer with the given stop words and a stem exclusion set.
* If a stem exclusion set is provided, this analyzer will add a {@link KeywordMarkerFilter}
* before {@link BulgarianStemFilter}.
*/
public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
}
/**
* Creates a
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* which tokenizes all the text in the provided {@link Reader}.
*
* @return A
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
* {@link KeywordMarkerFilter} (if a stem exclusion set is provided),
* and {@link BulgarianStemFilter}.
*/
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new BulgarianStemFilter(result);
return new TokenStreamComponents(source, result);
}
}

58
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/bg/BulgarianStemFilter.java

@@ -0,0 +1,58 @@
package com.fr.third.org.apache.lucene.analysis.bg;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* A {@link TokenFilter} that applies {@link BulgarianStemmer} to stem Bulgarian
* words.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
*/
public final class BulgarianStemFilter extends TokenFilter {
private final BulgarianStemmer stemmer = new BulgarianStemmer();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
public BulgarianStemFilter(final TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
if(!keywordAttr.isKeyword()) {
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
termAtt.setLength(newlen);
}
return true;
} else {
return false;
}
}
}

40
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/bg/BulgarianStemFilterFactory.java

@@ -0,0 +1,40 @@
package com.fr.third.org.apache.lucene.analysis.bg;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.bg.BulgarianStemFilter;
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link BulgarianStemFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_bgstem" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.BulgarianStemFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
*/
public class BulgarianStemFilterFactory extends TokenFilterFactory {
public TokenStream create(TokenStream input) {
return new BulgarianStemFilter(input);
}
}

143
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/bg/BulgarianStemmer.java

@@ -0,0 +1,143 @@
package com.fr.third.org.apache.lucene.analysis.bg;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static com.fr.third.org.apache.lucene.analysis.util.StemmerUtil.*;
/**
* Light Stemmer for Bulgarian.
* <p>
* Implements the algorithm described in:
* <i>
* Searching Strategies for the Bulgarian Language
* </i>
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
*/
public class BulgarianStemmer {
/**
* Stem an input buffer of Bulgarian text.
*
* @param s input buffer
* @param len length of input buffer
* @return length of input buffer after stemming
*/
public int stem(final char s[], int len) {
if (len < 4) // do not stem
return len;
if (len > 5 && endsWith(s, len, "ища"))
return len - 3;
len = removeArticle(s, len);
len = removePlural(s, len);
if (len > 3) {
if (endsWith(s, len, "я"))
len--;
if (endsWith(s, len, "а") ||
endsWith(s, len, "о") ||
endsWith(s, len, "е"))
len--;
}
// the rule to rewrite ен -> н is duplicated in the paper.
// in the perl implementation referenced by the paper, this is fixed.
// (it is fixed here as well)
if (len > 4 && endsWith(s, len, "ен")) {
s[len - 2] = 'н'; // replace with н
len--;
}
if (len > 5 && s[len - 2] == 'ъ') {
s[len - 2] = s[len - 1]; // replace ъN with N
len--;
}
return len;
}
/**
* Mainly remove the definite article
* @param s input buffer
* @param len length of input buffer
* @return new stemmed length
*/
private int removeArticle(final char s[], final int len) {
if (len > 6 && endsWith(s, len, "ият"))
return len - 3;
if (len > 5) {
if (endsWith(s, len, "ът") ||
endsWith(s, len, "то") ||
endsWith(s, len, "те") ||
endsWith(s, len, "та") ||
endsWith(s, len, "ия"))
return len - 2;
}
if (len > 4 && endsWith(s, len, "ят"))
return len - 2;
return len;
}
private int removePlural(final char s[], final int len) {
if (len > 6) {
if (endsWith(s, len, "овци"))
return len - 3; // replace with о
if (endsWith(s, len, "ове"))
return len - 3;
if (endsWith(s, len, "еве")) {
s[len - 3] = 'й'; // replace with й
return len - 2;
}
}
if (len > 5) {
if (endsWith(s, len, "ища"))
return len - 3;
if (endsWith(s, len, "та"))
return len - 2;
if (endsWith(s, len, "ци")) {
s[len - 2] = 'к'; // replace with к
return len - 1;
}
if (endsWith(s, len, "зи")) {
s[len - 2] = 'г'; // replace with г
return len - 1;
}
if (s[len - 3] == 'е' && s[len - 1] == 'и') {
s[len - 3] = 'я'; // replace е with я, remove и
return len - 1;
}
}
if (len > 4) {
if (endsWith(s, len, "си")) {
s[len - 2] = 'х'; // replace with х
return len - 1;
}
if (endsWith(s, len, "и"))
return len - 1;
}
return len;
}
}
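The stemmer mutates the buffer in place and returns the new logical length, so callers truncate themselves. A small sketch (the wrapper class is hypothetical; the example simply follows the rules above):

import com.fr.third.org.apache.lucene.analysis.bg.BulgarianStemmer;

public class BulgarianStemmerDemo { // hypothetical demo class
  public static void main(String[] args) {
    BulgarianStemmer stemmer = new BulgarianStemmer();
    char[] buf = "градовете".toCharArray(); // "the cities"
    int len = stemmer.stem(buf, buf.length);
    // removeArticle() drops "те", removePlural() drops "ове", leaving "град"
    System.out.println(new String(buf, 0, len));
  }
}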

22
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/bg/package.html

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Bulgarian.
</body>
</html>

138
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/br/BrazilianAnalyzer.java

@ -0,0 +1,138 @@
package com.fr.third.org.apache.lucene.analysis.br;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import com.fr.third.org.apache.lucene.analysis.Analyzer;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter;
import com.fr.third.org.apache.lucene.analysis.core.StopFilter;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import com.fr.third.org.apache.lucene.analysis.standard.StandardAnalyzer;
import com.fr.third.org.apache.lucene.analysis.standard.StandardFilter;
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import com.fr.third.org.apache.lucene.analysis.util.WordlistLoader;
import com.fr.third.org.apache.lucene.util.IOUtils;
import com.fr.third.org.apache.lucene.util.Version;
/**
* {@link Analyzer} for Brazilian Portuguese language.
* <p>
* Supports an external list of stopwords (words that
* will not be indexed at all) and an external list of exclusions (words that will
* not be stemmed, but indexed).
* </p>
*
* <p><b>NOTE</b>: This class uses the same {@link Version}
* dependent settings as {@link StandardAnalyzer}.</p>
*/
public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
/** File containing default Brazilian Portuguese stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
static {
try {
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class,
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set", ex);
}
}
}
/**
* Contains words that should be indexed but not stemmed.
*/
private CharArraySet excltable = CharArraySet.EMPTY_SET;
/**
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
*/
public BrazilianAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords) {
super(matchVersion, stopwords);
}
/**
* Builds an analyzer with the given stop words and stemming exclusion words.
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
* @param stemExclusionSet
* a set of terms not to be stemmed
*/
public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords,
CharArraySet stemExclusionSet) {
this(matchVersion, stopwords);
excltable = CharArraySet.unmodifiableSet(CharArraySet
.copy(matchVersion, stemExclusionSet));
}
/**
* Creates
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter},
* and {@link BrazilianStemFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new StandardFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(excltable != null && !excltable.isEmpty())
result = new KeywordMarkerFilter(result, excltable);
return new TokenStreamComponents(source, new BrazilianStemFilter(result));
}
}
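As with the other analyzers in this commit, consumption goes through Analyzer.tokenStream(); a hedged sketch (field name, sample text, and the demo class are made up):

import java.io.StringReader;
import com.fr.third.org.apache.lucene.analysis.Analyzer;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.br.BrazilianAnalyzer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.util.Version;

public class BrazilianAnalyzerDemo { // hypothetical demo class
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new BrazilianAnalyzer(Version.LUCENE_40);
    TokenStream ts = analyzer.tokenStream("body", new StringReader("as livrarias da cidade"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // stopwords dropped, remaining terms stemmed
    }
    ts.end();
    ts.close();
  }
}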

76
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/br/BrazilianStemFilter.java

@ -0,0 +1,76 @@
package com.fr.third.org.apache.lucene.analysis.br;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Set;
import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
/**
* A {@link TokenFilter} that applies {@link BrazilianStemmer}.
* <p>
* To prevent terms from being stemmed use an instance of
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
* the {@link KeywordAttribute} before this {@link TokenStream}.
* </p>
* @see KeywordMarkerFilter
*
*/
public final class BrazilianStemFilter extends TokenFilter {
/**
* {@link BrazilianStemmer} in use by this filter.
*/
private BrazilianStemmer stemmer = new BrazilianStemmer();
private Set<?> exclusions = null;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
/**
* Creates a new BrazilianStemFilter
*
* @param in the source {@link TokenStream}
*/
public BrazilianStemFilter(TokenStream in) {
super(in);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
final String term = termAtt.toString();
// Check the exclusion table.
if (!keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains(term))) {
final String s = stemmer.stem(term);
// If not stemmed, don't waste the time adjusting the token.
if ((s != null) && !s.equals(term))
termAtt.setEmpty().append(s);
}
return true;
} else {
return false;
}
}
}
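Per the javadoc above, terms are protected from stemming by setting the KeywordAttribute upstream. A sketch of such a chain, assuming the KeywordMarkerFilter(TokenStream, CharArraySet) constructor used elsewhere in this commit (the protected term is arbitrary):

import java.io.Reader;
import java.util.Arrays;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.br.BrazilianStemFilter;
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

public class KeywordProtectionDemo { // hypothetical demo class
  static TokenStream chain(Reader reader) {
    CharArraySet protect = new CharArraySet(Version.LUCENE_40, Arrays.asList("livrarias"), false);
    TokenStream ts = new StandardTokenizer(Version.LUCENE_40, reader);
    ts = new LowerCaseFilter(Version.LUCENE_40, ts);
    ts = new KeywordMarkerFilter(ts, protect); // sets KeywordAttribute on matching terms
    return new BrazilianStemFilter(ts); // skips tokens whose KeywordAttribute is set
  }
}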

41
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/br/BrazilianStemFilterFactory.java

@ -0,0 +1,41 @@
package com.fr.third.org.apache.lucene.analysis.br;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.br.BrazilianStemFilter;
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link BrazilianStemFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_brstem" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.BrazilianStemFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
*/
public class BrazilianStemFilterFactory extends TokenFilterFactory {
public BrazilianStemFilter create(TokenStream in) {
return new BrazilianStemFilter(in);
}
}

1024
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/br/BrazilianStemmer.java

File diff suppressed because it is too large

22
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/br/package.html

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Brazilian Portuguese.
</body>
</html>

148
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ca/CatalanAnalyzer.java

@ -0,0 +1,148 @@
package com.fr.third.org.apache.lucene.analysis.ca;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import com.fr.third.org.apache.lucene.analysis.Analyzer;
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter;
import com.fr.third.org.apache.lucene.analysis.core.StopFilter;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.snowball.SnowballFilter;
import com.fr.third.org.apache.lucene.analysis.standard.StandardFilter;
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.analysis.util.ElisionFilter;
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import com.fr.third.org.apache.lucene.util.Version;
import com.fr.third.org.tartarus.snowball.ext.CatalanStemmer;
/**
* {@link Analyzer} for Catalan.
* <p>
* <a name="version"/>
* <p>You must specify the required {@link Version}
* compatibility when creating CatalanAnalyzer:
* <ul>
* <li> As of 3.6, ElisionFilter with a set of Catalan
* contractions is used by default.
* </ul>
*/
public final class CatalanAnalyzer extends StopwordAnalyzerBase {
private final CharArraySet stemExclusionSet;
/** File containing default Catalan stopwords. */
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(
"d", "l", "m", "n", "s", "t"
), true));
/**
* Returns an unmodifiable instance of the default stop words set.
* @return default stop words set.
*/
public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
/**
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
* accesses the static final set the first time.
*/
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
static {
try {
DEFAULT_STOP_SET = loadStopwordSet(false,
CatalanAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set", ex);
}
}
}
/**
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
*/
public CatalanAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words.
*
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set
*/
public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords) {
this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
/**
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
* provided this analyzer will add a {@link KeywordMarkerFilter} before
* stemming.
*
* @param matchVersion lucene compatibility version
* @param stopwords a stopword set
* @param stemExclusionSet a set of terms not to be stemmed
*/
public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(matchVersion, stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
matchVersion, stemExclusionSet));
}
/**
* Creates a
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* which tokenizes all the text in the provided {@link Reader}.
*
* @return A
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter},
* {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is
* provided, and {@link SnowballFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
result = new ElisionFilter(result, DEFAULT_ARTICLES);
}
result = new LowerCaseFilter(matchVersion, result);
result = new StopFilter(matchVersion, result, stopwords);
if(!stemExclusionSet.isEmpty())
result = new KeywordMarkerFilter(result, stemExclusionSet);
result = new SnowballFilter(result, new CatalanStemmer());
return new TokenStreamComponents(source, result);
}
}
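A sketch of the three-argument constructor with a stem-exclusion set (the excluded term and demo class are arbitrary). Since the matchVersion here is on or after LUCENE_36, the ElisionFilter is active, so a leading article such as l' should be stripped before stemming:

import java.util.Arrays;
import com.fr.third.org.apache.lucene.analysis.Analyzer;
import com.fr.third.org.apache.lucene.analysis.ca.CatalanAnalyzer;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

public class CatalanAnalyzerDemo { // hypothetical demo class
  public static void main(String[] args) {
    CharArraySet noStem = new CharArraySet(Version.LUCENE_40, Arrays.asList("barcelona"), true);
    Analyzer analyzer = new CatalanAnalyzer(Version.LUCENE_40,
        CatalanAnalyzer.getDefaultStopSet(), noStem);
    // "barcelona" passes through unstemmed; other terms go to the Snowball stemmer
  }
}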

22
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/ca/package.html

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Catalan.
</body>
</html>

110
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/BaseCharFilter.java

@ -0,0 +1,110 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fr.third.org.apache.lucene.analysis.charfilter;
import com.fr.third.org.apache.lucene.analysis.CharFilter;
import com.fr.third.org.apache.lucene.util.ArrayUtil;
import java.io.Reader;
import java.util.Arrays;
/**
* Base utility class for implementing a {@link CharFilter}.
* Subclasses record mappings by calling
* {@link #addOffCorrectMap}, and then invoke {@link #correct}
* to map an offset in the filtered output back to the input.
*/
public abstract class BaseCharFilter extends CharFilter {
private int offsets[];
private int diffs[];
private int size = 0;
public BaseCharFilter(Reader in) {
super(in);
}
/** Retrieve the corrected offset. */
@Override
protected int correct(int currentOff) {
if (offsets == null || currentOff < offsets[0]) {
return currentOff;
}
int hi = size - 1;
if(currentOff >= offsets[hi])
return currentOff + diffs[hi];
int lo = 0;
int mid = -1;
while (hi >= lo) {
mid = (lo + hi) >>> 1;
if (currentOff < offsets[mid])
hi = mid - 1;
else if (currentOff > offsets[mid])
lo = mid + 1;
else
return currentOff + diffs[mid];
}
if (currentOff < offsets[mid])
return mid == 0 ? currentOff : currentOff + diffs[mid-1];
else
return currentOff + diffs[mid];
}
protected int getLastCumulativeDiff() {
return offsets == null ?
0 : diffs[size-1];
}
/**
* <p>
* Adds an offset correction mapping at the given output stream offset.
* </p>
* <p>
* Assumption: the offset given with each successive call to this method
* will not be smaller than the offset given at the previous invocation.
* </p>
*
* @param off The output stream offset at which to apply the correction
* @param cumulativeDiff The input offset is given by adding this
* to the output offset
*/
protected void addOffCorrectMap(int off, int cumulativeDiff) {
if (offsets == null) {
offsets = new int[64];
diffs = new int[64];
} else if (size == offsets.length) {
offsets = ArrayUtil.grow(offsets);
diffs = ArrayUtil.grow(diffs);
}
assert (size == 0 || off >= offsets[size - 1])
: "Offset #" + size + "(" + off + ") is less than the last recorded offset "
+ offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
if (size == 0 || off != offsets[size - 1]) {
offsets[size] = off;
diffs[size++] = cumulativeDiff;
} else { // Overwrite the diff at the last recorded offset
diffs[size - 1] = cumulativeDiff;
}
}
}
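The two parallel arrays form a step function: offsets[] holds output offsets, diffs[] the cumulative input-minus-output difference that applies at and after each offset, and correct() binary-searches for the step covering a query offset. A hypothetical subclass (not in this commit) that deletes one character and records the corrections might look like the sketch below; it relies on the protected 'input' Reader that CharFilter exposes to subclasses:

import java.io.IOException;
import java.io.Reader;
import com.fr.third.org.apache.lucene.analysis.charfilter.BaseCharFilter;

public final class DropXCharFilter extends BaseCharFilter { // hypothetical
  private int outputCount = 0; // chars emitted so far
  private int deleted = 0;     // cumulative chars removed

  public DropXCharFilter(Reader in) {
    super(in);
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    int n = 0;
    while (n < len) {
      int ch = input.read(); // the wrapped Reader
      if (ch == -1) break;
      if (ch == 'x') {
        // from this output offset onward, input offset = output offset + deleted
        addOffCorrectMap(outputCount, ++deleted);
        continue;
      }
      cbuf[off + n++] = (char) ch;
      outputCount++;
    }
    return (n == 0 && len > 0) ? -1 : n;
  }
}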

162
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex

@ -0,0 +1,162 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
| "Aring" | "Atilde" | "Auml" | "Beta" | "Ccedil" | "Chi"
| "Dagger" | "Delta" | "ETH" | "Eacute" | "Ecirc"
| "Egrave" | "Epsilon" | "Eta" | "Euml" | "Gamma"
| "Iacute" | "Icirc" | "Igrave" | "Iota" | "Iuml" | "Kappa"
| "Lambda" | "Mu" | "Ntilde" | "Nu" | "OElig" | "Oacute"
| "Ocirc" | "Ograve" | "Omega" | "Omicron" | "Oslash"
| "Otilde" | "Ouml" | "Phi" | "Pi" | "Prime" | "Psi"
| "Rho" | "Scaron" | "Sigma" | "THORN" | "Tau" | "Theta"
| "Uacute" | "Ucirc" | "Ugrave" | "Upsilon" | "Uuml" | "Xi"
| "Yacute" | "Yuml" | "Zeta" | "aacute" | "acirc" | "acute"
| "aelig" | "agrave" | "alefsym" | "alpha" | "amp" | "AMP"
| "and" | "ang" | "apos" | "aring" | "asymp" | "atilde"
| "auml" | "bdquo" | "beta" | "brvbar" | "bull" | "cap"
| "ccedil" | "cedil" | "cent" | "chi" | "circ" | "clubs"
| "cong" | "copy" | "COPY" | "crarr" | "cup" | "curren"
| "dArr" | "dagger" | "darr" | "deg" | "delta" | "diams"
| "divide" | "eacute" | "ecirc" | "egrave" | "empty"
| "emsp" | "ensp" | "epsilon" | "equiv" | "eta" | "eth"
| "euml" | "euro" | "exist" | "fnof" | "forall" | "frac12"
| "frac14" | "frac34" | "frasl" | "gamma" | "ge" | "gt"
| "GT" | "hArr" | "harr" | "hearts" | "hellip" | "iacute"
| "icirc" | "iexcl" | "igrave" | "image" | "infin" | "int"
| "iota" | "iquest" | "isin" | "iuml" | "kappa" | "lArr"
| "lambda" | "lang" | "laquo" | "larr" | "lceil" | "ldquo"
| "le" | "lfloor" | "lowast" | "loz" | "lrm" | "lsaquo"
| "lsquo" | "lt" | "LT" | "macr" | "mdash" | "micro"
| "middot" | "minus" | "mu" | "nabla" | "nbsp" | "ndash"
| "ne" | "ni" | "not" | "notin" | "nsub" | "ntilde" | "nu"
| "oacute" | "ocirc" | "oelig" | "ograve" | "oline"
| "omega" | "omicron" | "oplus" | "or" | "ordf" | "ordm"
| "oslash" | "otilde" | "otimes" | "ouml" | "para" | "part"
| "permil" | "perp" | "phi" | "pi" | "piv" | "plusmn"
| "pound" | "prime" | "prod" | "prop" | "psi" | "quot"
| "QUOT" | "rArr" | "radic" | "rang" | "raquo" | "rarr"
| "rceil" | "rdquo" | "real" | "reg" | "REG" | "rfloor"
| "rho" | "rlm" | "rsaquo" | "rsquo" | "sbquo" | "scaron"
| "sdot" | "sect" | "shy" | "sigma" | "sigmaf" | "sim"
| "spades" | "sub" | "sube" | "sum" | "sup" | "sup1"
| "sup2" | "sup3" | "supe" | "szlig" | "tau" | "there4"
| "theta" | "thetasym" | "thinsp" | "thorn" | "tilde"
| "times" | "trade" | "uArr" | "uacute" | "uarr" | "ucirc"
| "ugrave" | "uml" | "upsih" | "upsilon" | "uuml"
| "weierp" | "xi" | "yacute" | "yen" | "yuml" | "zeta"
| "zwj" | "zwnj" )
%{
private static final Map<String,String> upperCaseVariantsAccepted
= new HashMap<String,String>();
static {
upperCaseVariantsAccepted.put("quot", "QUOT");
upperCaseVariantsAccepted.put("copy", "COPY");
upperCaseVariantsAccepted.put("gt", "GT");
upperCaseVariantsAccepted.put("lt", "LT");
upperCaseVariantsAccepted.put("reg", "REG");
upperCaseVariantsAccepted.put("amp", "AMP");
}
private static final CharArrayMap<Character> entityValues
= new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
static {
String[] entities = {
"AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
"Agrave", "\u00C0", "Alpha", "\u0391", "Aring", "\u00C5",
"Atilde", "\u00C3", "Auml", "\u00C4", "Beta", "\u0392",
"Ccedil", "\u00C7", "Chi", "\u03A7", "Dagger", "\u2021",
"Delta", "\u0394", "ETH", "\u00D0", "Eacute", "\u00C9",
"Ecirc", "\u00CA", "Egrave", "\u00C8", "Epsilon", "\u0395",
"Eta", "\u0397", "Euml", "\u00CB", "Gamma", "\u0393", "Iacute", "\u00CD",
"Icirc", "\u00CE", "Igrave", "\u00CC", "Iota", "\u0399",
"Iuml", "\u00CF", "Kappa", "\u039A", "Lambda", "\u039B", "Mu", "\u039C",
"Ntilde", "\u00D1", "Nu", "\u039D", "OElig", "\u0152",
"Oacute", "\u00D3", "Ocirc", "\u00D4", "Ograve", "\u00D2",
"Omega", "\u03A9", "Omicron", "\u039F", "Oslash", "\u00D8",
"Otilde", "\u00D5", "Ouml", "\u00D6", "Phi", "\u03A6", "Pi", "\u03A0",
"Prime", "\u2033", "Psi", "\u03A8", "Rho", "\u03A1", "Scaron", "\u0160",
"Sigma", "\u03A3", "THORN", "\u00DE", "Tau", "\u03A4", "Theta", "\u0398",
"Uacute", "\u00DA", "Ucirc", "\u00DB", "Ugrave", "\u00D9",
"Upsilon", "\u03A5", "Uuml", "\u00DC", "Xi", "\u039E",
"Yacute", "\u00DD", "Yuml", "\u0178", "Zeta", "\u0396",
"aacute", "\u00E1", "acirc", "\u00E2", "acute", "\u00B4",
"aelig", "\u00E6", "agrave", "\u00E0", "alefsym", "\u2135",
"alpha", "\u03B1", "amp", "\u0026", "and", "\u2227", "ang", "\u2220",
"apos", "\u0027", "aring", "\u00E5", "asymp", "\u2248",
"atilde", "\u00E3", "auml", "\u00E4", "bdquo", "\u201E",
"beta", "\u03B2", "brvbar", "\u00A6", "bull", "\u2022", "cap", "\u2229",
"ccedil", "\u00E7", "cedil", "\u00B8", "cent", "\u00A2", "chi", "\u03C7",
"circ", "\u02C6", "clubs", "\u2663", "cong", "\u2245", "copy", "\u00A9",
"crarr", "\u21B5", "cup", "\u222A", "curren", "\u00A4", "dArr", "\u21D3",
"dagger", "\u2020", "darr", "\u2193", "deg", "\u00B0", "delta", "\u03B4",
"diams", "\u2666", "divide", "\u00F7", "eacute", "\u00E9",
"ecirc", "\u00EA", "egrave", "\u00E8", "empty", "\u2205",
"emsp", "\u2003", "ensp", "\u2002", "epsilon", "\u03B5",
"equiv", "\u2261", "eta", "\u03B7", "eth", "\u00F0", "euml", "\u00EB",
"euro", "\u20AC", "exist", "\u2203", "fnof", "\u0192",
"forall", "\u2200", "frac12", "\u00BD", "frac14", "\u00BC",
"frac34", "\u00BE", "frasl", "\u2044", "gamma", "\u03B3", "ge", "\u2265",
"gt", "\u003E", "hArr", "\u21D4", "harr", "\u2194", "hearts", "\u2665",
"hellip", "\u2026", "iacute", "\u00ED", "icirc", "\u00EE",
"iexcl", "\u00A1", "igrave", "\u00EC", "image", "\u2111",
"infin", "\u221E", "int", "\u222B", "iota", "\u03B9", "iquest", "\u00BF",
"isin", "\u2208", "iuml", "\u00EF", "kappa", "\u03BA", "lArr", "\u21D0",
"lambda", "\u03BB", "lang", "\u2329", "laquo", "\u00AB",
"larr", "\u2190", "lceil", "\u2308", "ldquo", "\u201C", "le", "\u2264",
"lfloor", "\u230A", "lowast", "\u2217", "loz", "\u25CA", "lrm", "\u200E",
"lsaquo", "\u2039", "lsquo", "\u2018", "lt", "\u003C", "macr", "\u00AF",
"mdash", "\u2014", "micro", "\u00B5", "middot", "\u00B7",
"minus", "\u2212", "mu", "\u03BC", "nabla", "\u2207", "nbsp", " ",
"ndash", "\u2013", "ne", "\u2260", "ni", "\u220B", "not", "\u00AC",
"notin", "\u2209", "nsub", "\u2284", "ntilde", "\u00F1", "nu", "\u03BD",
"oacute", "\u00F3", "ocirc", "\u00F4", "oelig", "\u0153",
"ograve", "\u00F2", "oline", "\u203E", "omega", "\u03C9",
"omicron", "\u03BF", "oplus", "\u2295", "or", "\u2228", "ordf", "\u00AA",
"ordm", "\u00BA", "oslash", "\u00F8", "otilde", "\u00F5",
"otimes", "\u2297", "ouml", "\u00F6", "para", "\u00B6", "part", "\u2202",
"permil", "\u2030", "perp", "\u22A5", "phi", "\u03C6", "pi", "\u03C0",
"piv", "\u03D6", "plusmn", "\u00B1", "pound", "\u00A3",
"prime", "\u2032", "prod", "\u220F", "prop", "\u221D", "psi", "\u03C8",
"quot", "\"", "rArr", "\u21D2", "radic", "\u221A", "rang", "\u232A",
"raquo", "\u00BB", "rarr", "\u2192", "rceil", "\u2309",
"rdquo", "\u201D", "real", "\u211C", "reg", "\u00AE", "rfloor", "\u230B",
"rho", "\u03C1", "rlm", "\u200F", "rsaquo", "\u203A", "rsquo", "\u2019",
"sbquo", "\u201A", "scaron", "\u0161", "sdot", "\u22C5",
"sect", "\u00A7", "shy", "\u00AD", "sigma", "\u03C3", "sigmaf", "\u03C2",
"sim", "\u223C", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286",
"sum", "\u2211", "sup", "\u2283", "sup1", "\u00B9", "sup2", "\u00B2",
"sup3", "\u00B3", "supe", "\u2287", "szlig", "\u00DF", "tau", "\u03C4",
"there4", "\u2234", "theta", "\u03B8", "thetasym", "\u03D1",
"thinsp", "\u2009", "thorn", "\u00FE", "tilde", "\u02DC",
"times", "\u00D7", "trade", "\u2122", "uArr", "\u21D1",
"uacute", "\u00FA", "uarr", "\u2191", "ucirc", "\u00FB",
"ugrave", "\u00F9", "uml", "\u00A8", "upsih", "\u03D2",
"upsilon", "\u03C5", "uuml", "\u00FC", "weierp", "\u2118",
"xi", "\u03BE", "yacute", "\u00FD", "yen", "\u00A5", "yuml", "\u00FF",
"zeta", "\u03B6", "zwj", "\u200D", "zwnj", "\u200C"
};
for (int i = 0 ; i < entities.length ; i += 2) {
Character value = entities[i + 1].charAt(0);
entityValues.put(entities[i], value);
String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);
if (upperCaseVariant != null) {
entityValues.put(upperCaseVariant, value);
}
}
}
%}
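These %{ ... %} declarations are spliced verbatim into the generated scanner, where lookups run against the lexer's char buffer without allocating a String per match (see the entityValues.get(zzBuffer, ...) call in HTMLStripCharFilter.jflex below). A standalone sketch of the same CharArrayMap idiom, with a hypothetical two-entry map:

import com.fr.third.org.apache.lucene.analysis.util.CharArrayMap;
import com.fr.third.org.apache.lucene.util.Version;

public class EntityLookupDemo { // hypothetical demo class
  public static void main(String[] args) {
    CharArrayMap<Character> map = new CharArrayMap<Character>(Version.LUCENE_40, 2, false);
    map.put("amp", Character.valueOf('&'));
    map.put("lt", Character.valueOf('<'));
    char[] buf = "&amp;".toCharArray();
    // look up the slice "amp" directly in the buffer, no substring allocation
    System.out.println(map.get(buf, 1, 3)); // prints &
  }
}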

64
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro

@ -0,0 +1,64 @@
/*
* Copyright 2010 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated using ICU4J 49.1.0.0 on Sunday, July 15, 2012 5:42:00 AM UTC
// by com.fr.third.org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
ID_Start_Supp = (
[\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD81A][\uDC00-\uDE38]
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
| [\uD82C][\uDC00\uDC01]
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
| [\uD81B][\uDF00-\uDF44\uDF50\uDF93-\uDF9F]
| [\uD87E][\uDC00-\uDE1D]
| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4]
| [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
| [\uD809][\uDC00-\uDC62]
| [\uD808][\uDC00-\uDF6E]
| [\uD803][\uDC00-\uDC48]
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
| [\uD80D][\uDC00-\uDC2E]
| [\uD805][\uDE80-\uDEAA]
| [\uD86E][\uDC00-\uDC1D]
| [\uD801][\uDC00-\uDC9D]
)
ID_Continue_Supp = (
[\uD81A][\uDC00-\uDE38]
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
| [\uD82C][\uDC00\uDC01]
| [\uD81B][\uDF00-\uDF44\uDF50-\uDF7E\uDF8F-\uDF9F]
| [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
| [\uD87E][\uDC00-\uDE1D]
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD805][\uDE80-\uDEB7\uDEC0-\uDEC9]
| [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
| [\uD809][\uDC00-\uDC62]
| [\uD808][\uDC00-\uDF6E]
| [\uD803][\uDC00-\uDC48]
| [\uD80D][\uDC00-\uDC2E]
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA\uDCD0-\uDCE8\uDCF0-\uDCF9\uDD00-\uDD34\uDD36-\uDD3F\uDD80-\uDDC4\uDDD0-\uDDD9]
| [\uD86E][\uDC00-\uDC1D]
| [\uDB40][\uDD00-\uDDEF]
| [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
)

31821
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java

File diff suppressed because it is too large

919
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex

@ -0,0 +1,919 @@
package com.fr.third.org.apache.lucene.analysis.charfilter;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import com.fr.third.org.apache.lucene.util.Version;
import com.fr.third.org.apache.lucene.analysis.util.CharArrayMap;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.analysis.util.OpenStringBuilder;
/**
* A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
*/
@SuppressWarnings("fallthrough")
%%
%unicode 6.1
%apiprivate
%type int
%final
%public
%char
%function nextChar
%class HTMLStripCharFilter
%extends BaseCharFilter
%xstate AMPERSAND, NUMERIC_CHARACTER, CHARACTER_REFERENCE_TAIL
%xstate LEFT_ANGLE_BRACKET, BANG, COMMENT, SCRIPT, SCRIPT_COMMENT
%xstate LEFT_ANGLE_BRACKET_SLASH, LEFT_ANGLE_BRACKET_SPACE, CDATA
%xstate SERVER_SIDE_INCLUDE, SINGLE_QUOTED_STRING, DOUBLE_QUOTED_STRING
%xstate END_TAG_TAIL_INCLUDE, END_TAG_TAIL_EXCLUDE, END_TAG_TAIL_SUBSTITUTE
%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
%xstate STYLE, STYLE_COMMENT
// From XML 1.0 <http://www.w3.org/TR/xml/>:
//
// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | [...]
// [5] Name ::= NameStartChar (NameChar)*
//
// From UAX #31: Unicode Identifier and Pattern Syntax
// <http://unicode.org/reports/tr31/>:
//
// D1. Default Identifier Syntax
//
// <identifier> := <ID_Start> <ID_Continue>*
//
Name = ( ( [:_\p{ID_Start}] | {ID_Start_Supp} ) ( [-.:_\p{ID_Continue}] | {ID_Continue_Supp} )* )
// From Apache httpd mod_include documentation
// <http://httpd.apache.org/docs/current/mod/mod_include.html>:
//
// Basic Elements
//
// The document is parsed as an HTML document, with special commands
// embedded as SGML comments. A command has the syntax:
//
// <!--#element attribute=value attribute=value ... -->
//
// The value will often be enclosed in double quotes, but single quotes (')
// and backticks (`) are also possible. Many commands only allow a single
// attribute-value pair. Note that the comment terminator (-->) should be
// preceded by whitespace to ensure that it isn't considered part of an SSI
// token. Note that the leading <!--# is one token and may not contain any
// whitespaces.
//
EventAttributeSuffixes = ( [aA][bB][oO][rR][tT] |
[bB][lL][uU][rR] |
[cC][hH][aA][nN][gG][eE] |
[cC][lL][iI][cC][kK] |
[dD][bB][lL][cC][lL][iI][cC][kK] |
[eE][rR][rR][oO][rR] |
[fF][oO][cC][uU][sS] |
[kK][eE][yY][dD][oO][wW][nN] |
[kK][eE][yY][pP][rR][eE][sS][sS] |
[kK][eE][yY][uU][pP] |
[lL][oO][aA][dD] |
[mM][oO][uU][sS][eE][dD][oO][wW][nN] |
[mM][oO][uU][sS][eE][mM][oO][vV][eE] |
[mM][oO][uU][sS][eE][oO][uU][tT] |
[mM][oO][uU][sS][eE][oO][vV][eE][rR] |
[mM][oO][uU][sS][eE][uU][pP] |
[rR][eE][sS][eE][tT] |
[sS][eE][lL][eE][cC][tT] |
[sS][uU][bB][mM][iI][tT] |
[uU][nN][lL][oO][aA][dD] )
SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )*
InlineElment = ( [aAbBiIqQsSuU] |
[aA][bB][bB][rR] |
[aA][cC][rR][oO][nN][yY][mM] |
[bB][aA][sS][eE][fF][oO][nN][tT] |
[bB][dD][oO] |
[bB][iI][gG] |
[cC][iI][tT][eE] |
[cC][oO][dD][eE] |
[dD][fF][nN] |
[eE][mM] |
[fF][oO][nN][tT] |
[iI][mM][gG] |
[iI][nN][pP][uU][tT] |
[kK][bB][dD] |
[lL][aA][bB][eE][lL] |
[sS][aA][mM][pP] |
[sS][eE][lL][eE][cC][tT] |
[sS][mM][aA][lL][lL] |
[sS][pP][aA][nN] |
[sS][tT][rR][iI][kK][eE] |
[sS][tT][rR][oO][nN][gG] |
[sS][uU][bB] |
[sS][uU][pP] |
[tT][eE][xX][tT][aA][rR][eE][aA] |
[tT][tT] |
[vV][aA][rR] )
%include HTMLCharacterEntities.jflex
%include HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
%{
private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
private static final char BLOCK_LEVEL_START_TAG_REPLACEMENT = '\n';
private static final char BLOCK_LEVEL_END_TAG_REPLACEMENT = '\n';
private static final char BR_START_TAG_REPLACEMENT = '\n';
private static final char BR_END_TAG_REPLACEMENT = '\n';
private static final char SCRIPT_REPLACEMENT = '\n';
private static final char STYLE_REPLACEMENT = '\n';
private static final char REPLACEMENT_CHARACTER = '\uFFFD';
private CharArraySet escapedTags = null;
private int inputStart;
private int cumulativeDiff;
private boolean escapeBR = false;
private boolean escapeSCRIPT = false;
private boolean escapeSTYLE = false;
private int restoreState;
private int previousRestoreState;
private int outputCharCount;
private int eofReturnValue;
private TextSegment inputSegment
= new TextSegment(INITIAL_INPUT_SEGMENT_SIZE);
private TextSegment outputSegment = inputSegment;
private TextSegment entitySegment = new TextSegment(2);
/**
* Creates a new HTMLStripCharFilter over the provided Reader.
* @param source Reader to strip html tags from.
*/
public HTMLStripCharFilter(Reader source) {
super(source);
this.zzReader = source;
}
/**
* Creates a new HTMLStripCharFilter over the provided Reader
* with the specified start and end tags.
* @param source Reader to strip html tags from.
* @param escapedTags Tags in this set (both start and end tags)
* will not be filtered out.
*/
public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
super(source);
this.zzReader = source;
if (null != escapedTags) {
for (String tag : escapedTags) {
if (tag.equalsIgnoreCase("BR")) {
escapeBR = true;
} else if (tag.equalsIgnoreCase("SCRIPT")) {
escapeSCRIPT = true;
} else if (tag.equalsIgnoreCase("STYLE")) {
escapeSTYLE = true;
} else {
if (null == this.escapedTags) {
this.escapedTags = new CharArraySet(Version.LUCENE_40, 16, true);
}
this.escapedTags.add(tag);
}
}
}
}
@Override
public int read() throws IOException {
if (outputSegment.isRead()) {
if (zzAtEOF) {
return -1;
}
int ch = nextChar();
++outputCharCount;
return ch;
}
int ch = outputSegment.nextChar();
++outputCharCount;
return ch;
}
@Override
public int read(char cbuf[], int off, int len) throws IOException {
int i = 0;
for ( ; i < len ; ++i) {
int ch = read();
if (ch == -1) break;
cbuf[off++] = (char)ch;
}
return i > 0 ? i : (len == 0 ? 0 : -1);
}
@Override
public void close() throws IOException {
yyclose();
}
static int getInitialBufferSize() { // Package private, for testing purposes
return ZZ_BUFFERSIZE;
}
private class TextSegment extends OpenStringBuilder {
/** The position from which the next char will be read. */
int pos = 0;
/** Wraps the given buffer and sets this.len to the given length. */
TextSegment(char[] buffer, int length) {
super(buffer, length);
}
/** Allocates an internal buffer of the given size. */
TextSegment(int size) {
super(size);
}
/** Sets len = 0 and pos = 0. */
void clear() {
reset();
restart();
}
/** Sets pos = 0 */
void restart() {
pos = 0;
}
/** Returns the next char in the segment. */
int nextChar() {
assert (! isRead()): "Attempting to read past the end of a segment.";
return buf[pos++];
}
/** Returns true when all characters in the text segment have been read */
boolean isRead() {
return pos >= len;
}
}
%}
%eofval{
return eofReturnValue;
%eofval}
%eof{
switch (zzLexicalState) {
case SCRIPT:
case COMMENT:
case SCRIPT_COMMENT:
case STYLE:
case STYLE_COMMENT:
case SINGLE_QUOTED_STRING:
case DOUBLE_QUOTED_STRING:
case END_TAG_TAIL_EXCLUDE:
case END_TAG_TAIL_SUBSTITUTE:
case START_TAG_TAIL_EXCLUDE:
case SERVER_SIDE_INCLUDE:
case START_TAG_TAIL_SUBSTITUTE: { // Exclude
// add (length of input that won't be output) [ - (substitution length) = 0 ]
cumulativeDiff += yychar - inputStart;
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
outputSegment.clear();
eofReturnValue = -1;
break;
}
case CHARACTER_REFERENCE_TAIL: { // Substitute
// At end of file, allow char refs without semicolons
// add (length of input that won't be output) - (substitution length)
cumulativeDiff += inputSegment.length() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
eofReturnValue = outputSegment.nextChar();
break;
}
case BANG:
case CDATA:
case AMPERSAND:
case NUMERIC_CHARACTER:
case END_TAG_TAIL_INCLUDE:
case START_TAG_TAIL_INCLUDE:
case LEFT_ANGLE_BRACKET:
case LEFT_ANGLE_BRACKET_SLASH:
case LEFT_ANGLE_BRACKET_SPACE: { // Include
outputSegment = inputSegment;
eofReturnValue = outputSegment.nextChar();
break;
}
default: {
eofReturnValue = -1;
}
}
%eof}
%%
"&" {
inputStart = yychar;
inputSegment.clear();
inputSegment.append('&');
yybegin(AMPERSAND);
}
"<" {
inputStart = yychar;
inputSegment.clear();
inputSegment.append('<');
yybegin(LEFT_ANGLE_BRACKET);
}
<AMPERSAND> {
{CharacterEntities} {
int length = yylength();
inputSegment.write(zzBuffer, zzStartRead, length);
entitySegment.clear();
char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
entitySegment.append(ch);
outputSegment = entitySegment;
yybegin(CHARACTER_REFERENCE_TAIL);
}
"#" { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER); }
// 1 1 11 11
// 0 1 2 3 45 678 9 0 1 23 45
"#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
// Handle paired UTF-16 surrogates.
outputSegment = entitySegment;
outputSegment.clear();
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
// 1 1 11 11
// 01 2 345 678 9 0 1 23 45
"#5" [56] \d{3} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
// Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(1, 6) + "'";
}
if (Character.isHighSurrogate(highSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
// 1 111 11
// 0 1 2 3 45 6789 0 123 45
"#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#5" [67] \d{3} ";" {
// Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
char lowSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try { // Low surrogates are in decimal range [56320, 57343]
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(9, 14) + "'";
}
if (Character.isLowSurrogate(lowSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
// 1 111 11
// 01 2 345 6789 0 123 45
"#5" [56] \d{3} ";&#5" [67] \d{3} ";" {
// Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(1, 6) + "'";
}
if (Character.isHighSurrogate(highSurrogate)) {
char lowSurrogate = '\u0000';
try { // Low surrogates are in decimal range [56320, 57343]
lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(9, 14) + "'";
}
if (Character.isLowSurrogate(lowSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
outputSegment.unsafeWrite(lowSurrogate);
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
}
<NUMERIC_CHARACTER> {
[xX] [0-9A-Fa-f]+ {
int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
if (matchLength <= 6) { // 10FFFF: max 6 hex chars
String hexCharRef
= new String(zzBuffer, zzStartRead + 1, matchLength - 1);
int codePoint = 0;
try {
codePoint = Integer.parseInt(hexCharRef, 16);
} catch(Exception e) {
assert false: "Exception parsing hex code point '" + hexCharRef + "'";
}
if (codePoint <= 0x10FFFF) {
outputSegment = entitySegment;
outputSegment.clear();
if (codePoint >= Character.MIN_SURROGATE
&& codePoint <= Character.MAX_SURROGATE) {
outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
} else {
outputSegment.setLength
(Character.toChars(codePoint, outputSegment.getArray(), 0));
}
yybegin(CHARACTER_REFERENCE_TAIL);
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
[0-9]+ {
int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
String decimalCharRef = yytext();
int codePoint = 0;
try {
codePoint = Integer.parseInt(decimalCharRef);
} catch(Exception e) {
assert false: "Exception parsing code point '" + decimalCharRef + "'";
}
if (codePoint <= 0x10FFFF) {
outputSegment = entitySegment;
outputSegment.clear();
if (codePoint >= Character.MIN_SURROGATE
&& codePoint <= Character.MAX_SURROGATE) {
outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
} else {
outputSegment.setLength
(Character.toChars(codePoint, outputSegment.getArray(), 0));
}
yybegin(CHARACTER_REFERENCE_TAIL);
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
}
<CHARACTER_REFERENCE_TAIL> {
";" {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
<LEFT_ANGLE_BRACKET_SLASH> {
\s+ { inputSegment.write(zzBuffer, zzStartRead, yylength()); }
[bB][rR] \s* ">" {
yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_END_TAG_REPLACEMENT;
}
}
{InlineElment} {
inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_EXCLUDE);
}
}
{Name} {
inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_SUBSTITUTE);
}
}
}
<END_TAG_TAIL_INCLUDE> {
\s* ">" {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
<END_TAG_TAIL_EXCLUDE> {
\s* ">" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
}
<END_TAG_TAIL_SUBSTITUTE> {
\s* ">" {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
}
}
<LEFT_ANGLE_BRACKET> {
"!" { inputSegment.append('!'); yybegin(BANG); }
"/" { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH); }
\s+ {
inputSegment.write(zzBuffer, zzStartRead, yylength());
yybegin(LEFT_ANGLE_BRACKET_SPACE);
}
"?" [^>]* [/?] ">" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
\s* [bB][rR] ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_START_TAG_REPLACEMENT;
}
}
\s* [sS][cC][rR][iI][pP][tT] ( \s+ {OpenTagContent} )? \s* ">" {
yybegin(SCRIPT);
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
}
\s* [sS][tT][yY][lL][eE] ( \s+ {OpenTagContent} )? \s* ">" {
yybegin(STYLE);
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
}
}
<LEFT_ANGLE_BRACKET, LEFT_ANGLE_BRACKET_SPACE> {
{InlineElment} {
inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(START_TAG_TAIL_EXCLUDE);
}
}
{Name} {
inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(START_TAG_TAIL_SUBSTITUTE);
}
}
}
<START_TAG_TAIL_INCLUDE> {
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
<START_TAG_TAIL_EXCLUDE> {
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
outputSegment = inputSegment;
yybegin(YYINITIAL);
}
}
<START_TAG_TAIL_SUBSTITUTE> {
( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
}
}
<BANG> {
"--" { yybegin(COMMENT); }
">" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
// From XML 1.0 <http://www.w3.org/TR/xml/>:
//
// [18] CDSect ::= CDStart CData CDEnd
// [19] CDStart ::= '<![CDATA['
// [20] CData ::= (Char* - (Char* ']]>' Char*))
// [21] CDEnd ::= ']]>'
//
"[CDATA[" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
}
[^] {
inputSegment.append(zzBuffer[zzStartRead]);
}
}
<CDATA> {
"]]>" {
// add (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
yybegin(YYINITIAL);
}
[^] { return zzBuffer[zzStartRead]; }
}
<COMMENT> {
"<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
"-->" {
// add (previously matched input length) + (this match length) [ - (substitution length) = 0]
cumulativeDiff += yychar - inputStart + yylength();
// position the correction at (already output length) [ + (substitution length) = 0]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
[^] { }
}
<SERVER_SIDE_INCLUDE> {
"-->" { yybegin(restoreState); }
"'" {
previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(SINGLE_QUOTED_STRING);
}
"\"" {
previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(DOUBLE_QUOTED_STRING);
}
[^] { }
}
<SCRIPT_COMMENT> {
"<!--#" { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
"'" { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
"\"" { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
"-->" { yybegin(SCRIPT); }
[^] { }
}
<STYLE_COMMENT> {
"<!--#" { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
"'" { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
"\"" { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
"-->" { yybegin(STYLE); }
[^] { }
}
<SINGLE_QUOTED_STRING> {
"\\" [^] { }
"'" { yybegin(restoreState); restoreState = previousRestoreState; }
[^] { }
}
<DOUBLE_QUOTED_STRING> {
"\\" [^] { }
"\"" { yybegin(restoreState); restoreState = previousRestoreState; }
[^] { }
}
<SCRIPT> {
"<!--" { yybegin(SCRIPT_COMMENT); }
"</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
// position at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
// add (substitution length)
++offsetCorrectionPos;
returnValue = SCRIPT_REPLACEMENT;
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
[^] { }
}
<STYLE> {
"<!--" { yybegin(STYLE_COMMENT); }
"</" \s* [sS][tT][yY][lL][eE] \s* ">" {
inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
// position the offset correction at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
// add (substitution length)
++offsetCorrectionPos;
returnValue = STYLE_REPLACEMENT;
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
[^] { }
}
<AMPERSAND,NUMERIC_CHARACTER,CHARACTER_REFERENCE_TAIL,LEFT_ANGLE_BRACKET_SLASH,END_TAG_TAIL_INCLUDE,END_TAG_TAIL_EXCLUDE,END_TAG_TAIL_SUBSTITUTE,LEFT_ANGLE_BRACKET,LEFT_ANGLE_BRACKET_SPACE,START_TAG_TAIL_INCLUDE,START_TAG_TAIL_EXCLUDE,START_TAG_TAIL_SUBSTITUTE,BANG> {
[^] {
yypushback(1);
outputSegment = inputSegment;
outputSegment.restart();
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
[^] { return zzBuffer[zzStartRead]; }
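The offset arithmetic in the rules above is easiest to follow on a concrete input. A worked sketch, assuming escapeBR is false (that the substitution is a single character is what the "- 1" terms encode): for the input a<br>b, the filter first emits 'a', so outputCharCount is 1. It then consumes the four characters of <br>, with '<' held in inputSegment and "br>" as the current match, and emits one replacement character. The rule adds inputSegment.length() + yylength() - 1 = 1 + 3 - 1 = 3 to cumulativeDiff and records the correction at outputCharCount + 1 = 2, so every output offset at or past 2 is shifted forward by 3 when corrected; the trailing 'b' at output offset 2 maps back to its original offset 5 in a<br>b.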

70
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterFactory.java

@@ -0,0 +1,70 @@
package com.fr.third.org.apache.lucene.analysis.charfilter;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import com.fr.third.org.apache.lucene.analysis.util.CharFilterFactory;
import java.io.Reader;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Factory for {@link HTMLStripCharFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_html" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;charFilter class="solr.HTMLStripCharFilterFactory" escapedTags="a, title" /&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
*/
public class HTMLStripCharFilterFactory extends CharFilterFactory {
Set<String> escapedTags = null;
Pattern TAG_NAME_PATTERN = Pattern.compile("[^\\s,]+");
public HTMLStripCharFilter create(Reader input) {
HTMLStripCharFilter charFilter;
if (null == escapedTags) {
charFilter = new HTMLStripCharFilter(input);
} else {
charFilter = new HTMLStripCharFilter(input, escapedTags);
}
return charFilter;
}
@Override
public void init(Map<String,String> args) {
super.init(args);
String escapedTagsArg = args.get("escapedTags");
if (null != escapedTagsArg) {
Matcher matcher = TAG_NAME_PATTERN.matcher(escapedTagsArg);
while (matcher.find()) {
if (null == escapedTags) {
escapedTags = new HashSet<String>();
}
escapedTags.add(matcher.group(0));
}
}
}
}
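For reference, a minimal usage sketch of the factory above (the sample HTML and tag list are illustrative; only the init() and create() methods defined in this file are exercised):

import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import com.fr.third.org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;

public class HTMLStripFactoryDemo {
  public static void main(String[] args) throws Exception {
    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
    Map<String,String> factoryArgs = new HashMap<String,String>();
    factoryArgs.put("escapedTags", "a, title");  // tags to leave in place rather than strip
    factory.init(factoryArgs);
    Reader stripped = factory.create(new StringReader("<b>bold</b> <a href=\"#\">link</a>"));
    int c;
    while ((c = stripped.read()) != -1) {
      System.out.print((char) c);  // markup removed except the escaped <a> tag
    }
  }
}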

191
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/MappingCharFilter.java

@@ -0,0 +1,191 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fr.third.org.apache.lucene.analysis.charfilter;
import java.io.IOException;
import java.io.Reader;
import java.util.Map;
import com.fr.third.org.apache.lucene.analysis.CharFilter; // javadocs
import com.fr.third.org.apache.lucene.analysis.util.RollingCharBuffer;
import com.fr.third.org.apache.lucene.util.CharsRef;
import com.fr.third.org.apache.lucene.util.fst.CharSequenceOutputs;
import com.fr.third.org.apache.lucene.util.fst.FST;
import com.fr.third.org.apache.lucene.util.fst.Outputs;
/**
* Simplistic {@link CharFilter} that applies the mappings
* contained in a {@link NormalizeCharMap} to the character
* stream, and corrects the resulting changes to the
* offsets. Matching is greedy (longest pattern matching at
* a given point wins). Replacement is allowed to be the
* empty string.
*/
public class MappingCharFilter extends BaseCharFilter {
private final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
private final FST<CharsRef> map;
private final FST.BytesReader fstReader;
private final RollingCharBuffer buffer = new RollingCharBuffer();
private final FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>();
private final Map<Character,FST.Arc<CharsRef>> cachedRootArcs;
private CharsRef replacement;
private int replacementPointer;
private int inputOff;
/** Default constructor that takes a {@link Reader}. */
public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
super(in);
buffer.reset(in);
map = normMap.map;
cachedRootArcs = normMap.cachedRootArcs;
if (map != null) {
fstReader = map.getBytesReader(0);
} else {
fstReader = null;
}
}
@Override
public void reset() throws IOException {
input.reset();
buffer.reset(input);
replacement = null;
inputOff = 0;
}
@Override
public int read() throws IOException {
//System.out.println("\nread");
while(true) {
if (replacement != null && replacementPointer < replacement.length) {
//System.out.println(" return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
return replacement.chars[replacement.offset + replacementPointer++];
}
// TODO: a more efficient approach would be Aho/Corasick's
// algorithm
// (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
// or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
//
// I think this would be (almost?) equivalent to 1) adding
// epsilon arcs from all final nodes back to the init
// node in the FST, 2) adding a .* (skip any char)
// loop on the initial node, and 3) determinizing
// that. Then we would not have to restart matching
// at each position.
int lastMatchLen = -1;
CharsRef lastMatch = null;
final int firstCH = buffer.get(inputOff);
if (firstCH != -1) {
FST.Arc<CharsRef> arc = cachedRootArcs.get(Character.valueOf((char) firstCH));
if (arc != null) {
if (!FST.targetHasArcs(arc)) {
// Fast pass for single character match:
assert arc.isFinal();
lastMatchLen = 1;
lastMatch = arc.output;
} else {
int lookahead = 0;
CharsRef output = arc.output;
while (true) {
lookahead++;
if (arc.isFinal()) {
// Match! (to node is final)
lastMatchLen = lookahead;
lastMatch = outputs.add(output, arc.nextFinalOutput);
// Greedy: keep searching to see if there's a
// longer match...
}
if (!FST.targetHasArcs(arc)) {
break;
}
int ch = buffer.get(inputOff + lookahead);
if (ch == -1) {
break;
}
if ((arc = map.findTargetArc(ch, arc, scratchArc, fstReader)) == null) {
// Dead end
break;
}
output = outputs.add(output, arc.output);
}
}
}
}
if (lastMatch != null) {
inputOff += lastMatchLen;
//System.out.println(" match! len=" + lastMatchLen + " repl=" + lastMatch);
final int diff = lastMatchLen - lastMatch.length;
if (diff != 0) {
final int prevCumulativeDiff = getLastCumulativeDiff();
if (diff > 0) {
// Replacement is shorter than matched input:
addOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
} else {
// Replacement is longer than matched input: remap
// the "extra" chars all back to the same input
// offset:
final int outputStart = inputOff - prevCumulativeDiff;
for(int extraIDX=0;extraIDX<-diff;extraIDX++) {
addOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
}
}
}
replacement = lastMatch;
replacementPointer = 0;
} else {
final int ret = buffer.get(inputOff);
if (ret != -1) {
inputOff++;
buffer.freeBefore(inputOff);
}
return ret;
}
}
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
int numRead = 0;
for(int i = off; i < off + len; i++) {
int c = read();
if (c == -1) break;
cbuf[i] = (char) c;
numRead++;
}
return numRead == 0 ? -1 : numRead;
}
}
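A minimal sketch of driving MappingCharFilter directly, using only the constructor above plus the NormalizeCharMap.Builder added later in this commit (the sample mappings are illustrative):

import java.io.Reader;
import java.io.StringReader;
import com.fr.third.org.apache.lucene.analysis.charfilter.MappingCharFilter;
import com.fr.third.org.apache.lucene.analysis.charfilter.NormalizeCharMap;

public class MappingCharFilterDemo {
  public static void main(String[] args) throws Exception {
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("ph", "f");      // matching is greedy: the longest pattern at a position wins
    builder.add("&nbsp;", " ");  // input and replacement may differ in length
    Reader filtered = new MappingCharFilter(builder.build(), new StringReader("alpha&nbsp;beta"));
    StringBuilder out = new StringBuilder();
    int c;
    while ((c = filtered.read()) != -1) {
      out.append((char) c);
    }
    System.out.println(out);  // prints "alfa beta"
  }
}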

135
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java

@@ -0,0 +1,135 @@
package com.fr.third.org.apache.lucene.analysis.charfilter;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.fr.third.org.apache.lucene.analysis.charfilter.MappingCharFilter;
import com.fr.third.org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import com.fr.third.org.apache.lucene.analysis.util.*;
/**
* Factory for {@link MappingCharFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_map" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;charFilter class="solr.MappingCharFilterFactory" mapping="mapping.txt"/&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
*
* @since Solr 1.4
*
*/
public class MappingCharFilterFactory extends CharFilterFactory implements
ResourceLoaderAware, MultiTermAwareComponent {
protected NormalizeCharMap normMap;
private String mapping;
// TODO: this should use inputstreams from the loader, not File!
public void inform(ResourceLoader loader) throws IOException {
mapping = args.get("mapping");
if (mapping != null) {
List<String> wlist = null;
File mappingFile = new File(mapping);
if (mappingFile.exists()) {
wlist = getLines(loader, mapping);
} else {
List<String> files = splitFileNames(mapping);
wlist = new ArrayList<String>();
for (String file : files) {
List<String> lines = getLines(loader, file.trim());
wlist.addAll(lines);
}
}
final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
parseRules(wlist, builder);
normMap = builder.build();
if (normMap.map == null) {
// if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
// so just set the whole map to null
normMap = null;
}
}
}
public Reader create(Reader input) {
// if the map is null, there are actually no mappings... just return the original stream
// as there is nothing to do here.
return normMap == null ? input : new MappingCharFilter(normMap,input);
}
// "source" => "target"
static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" );
protected void parseRules( List<String> rules, NormalizeCharMap.Builder builder ){
for( String rule : rules ){
Matcher m = p.matcher( rule );
if( !m.find() )
throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "], file = " + mapping);
builder.add( parseString( m.group( 1 ) ), parseString( m.group( 2 ) ) );
}
}
char[] out = new char[256];
protected String parseString( String s ){
int readPos = 0;
int len = s.length();
int writePos = 0;
while( readPos < len ){
char c = s.charAt( readPos++ );
if( c == '\\' ){
if( readPos >= len )
throw new IllegalArgumentException("Invalid escaped char in [" + s + "]");
c = s.charAt( readPos++ );
switch( c ) {
case '\\' : c = '\\'; break;
case '"' : c = '"'; break;
case 'n' : c = '\n'; break;
case 't' : c = '\t'; break;
case 'r' : c = '\r'; break;
case 'b' : c = '\b'; break;
case 'f' : c = '\f'; break;
case 'u' :
if( readPos + 3 >= len )
throw new IllegalArgumentException("Invalid escaped char in [" + s + "]");
c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 );
readPos += 4;
break;
}
}
out[writePos++] = c;
}
return new String( out, 0, writePos );
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
}
}
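The mapping file read by inform() above holds one rule per line in the form "source" => "target" (the pattern compiled into p), and parseString() honors the escapes \\, \", \n, \t, \r, \b, \f and \uXXXX on either side. A hypothetical mapping.txt, for illustration only:

"\u00E0" => "a"
"\u00E9" => "e"
"&amp;" => "&"
"\t" => " "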

127
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/NormalizeCharMap.java

@@ -0,0 +1,127 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fr.third.org.apache.lucene.analysis.charfilter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import com.fr.third.org.apache.lucene.util.CharsRef;
import com.fr.third.org.apache.lucene.util.IntsRef;
import com.fr.third.org.apache.lucene.util.fst.Builder;
import com.fr.third.org.apache.lucene.util.fst.CharSequenceOutputs;
import com.fr.third.org.apache.lucene.util.fst.FST;
import com.fr.third.org.apache.lucene.util.fst.Outputs;
import com.fr.third.org.apache.lucene.util.fst.Util;
// TODO: save/load?
/**
* Holds a map of String input to String output, to be used
* with {@link MappingCharFilter}. Use the {@link Builder}
* to create this.
*/
public class NormalizeCharMap {
final FST<CharsRef> map;
final Map<Character,FST.Arc<CharsRef>> cachedRootArcs = new HashMap<Character,FST.Arc<CharsRef>>();
// Use the builder to create:
private NormalizeCharMap(FST<CharsRef> map) {
this.map = map;
if (map != null) {
try {
// Pre-cache root arcs:
final FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>();
final FST.BytesReader fstReader = map.getBytesReader(0);
map.getFirstArc(scratchArc);
if (FST.targetHasArcs(scratchArc)) {
map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader);
while(true) {
assert scratchArc.label != FST.END_LABEL;
cachedRootArcs.put(Character.valueOf((char) scratchArc.label), new FST.Arc<CharsRef>().copyFrom(scratchArc));
if (scratchArc.isLast()) {
break;
}
map.readNextRealArc(scratchArc, fstReader);
}
}
//System.out.println("cached " + cachedRootArcs.size() + " root arcs");
} catch (IOException ioe) {
// Bogus FST IOExceptions!! (will never happen)
throw new RuntimeException(ioe);
}
}
}
/**
* Builds a NormalizeCharMap.
* <p>
* Call add() until you have added all the mappings, then call build() to get a NormalizeCharMap
* @lucene.experimental
*/
public static class Builder {
private final Map<String,String> pendingPairs = new TreeMap<String,String>();
/** Records a replacement to be applied to the input
* stream. Whenever <code>match</code> occurs in
* the input, it will be replaced with
* <code>replacement</code>.
*
* @param match input String to be replaced
* @param replacement output String
* @throws IllegalArgumentException if
* <code>match</code> is the empty string, or was
* already previously added
*/
public void add(String match, String replacement) {
if (match.length() == 0 ){
throw new IllegalArgumentException("cannot match the empty string");
}
if (pendingPairs.containsKey(match)) {
throw new IllegalArgumentException("match \"" + match + "\" was already added");
}
pendingPairs.put(match, replacement);
}
/** Builds the NormalizeCharMap; call this once you
* are done calling {@link #add}. */
public NormalizeCharMap build() {
final FST<CharsRef> map;
try {
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
final com.fr.third.org.apache.lucene.util.fst.Builder<CharsRef> builder = new com.fr.third.org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
final IntsRef scratch = new IntsRef();
for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
builder.add(Util.toUTF16(ent.getKey(), scratch),
new CharsRef(ent.getValue()));
}
map = builder.finish();
pendingPairs.clear();
} catch (IOException ioe) {
// Bogus FST IOExceptions!! (will never happen)
throw new RuntimeException(ioe);
}
return new NormalizeCharMap(map);
}
}
}
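As the javadoc above notes, Builder.add() rejects an empty match and a duplicate match; a short sketch of both failure modes (class name is illustrative):

import com.fr.third.org.apache.lucene.analysis.charfilter.NormalizeCharMap;

public class NormalizeCharMapBuilderDemo {
  public static void main(String[] args) {
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("oe", "\u0153");  // ok: "oe" -> ligature oe
    try {
      builder.add("oe", "o");     // duplicate match
    } catch (IllegalArgumentException expected) {
      System.out.println(expected.getMessage());  // match "oe" was already added
    }
    try {
      builder.add("", "x");       // empty match
    } catch (IllegalArgumentException expected) {
      System.out.println(expected.getMessage());  // cannot match the empty string
    }
    NormalizeCharMap map = builder.build();  // contains only the "oe" mapping
  }
}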

539
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/htmlentity.py

@@ -0,0 +1,539 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
# A simple python script to generate an HTML entity map and a regex alternation
# for inclusion in HTMLStripCharFilter.jflex.
def main():
print get_apache_license()
codes = {}
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
for line in get_entity_text().split('\n'):
match = regex.match(line)
if match:
key = match.group(1)
if key == 'quot': codes[key] = r'\"'
elif key == 'nbsp': codes[key] = ' '
else : codes[key] = r'\u%04X' % int(match.group(2))
keys = sorted(codes)
first_entry = True
output_line = 'CharacterEntities = ( '
for key in keys:
new_entry = ('"%s"' if first_entry else ' | "%s"') % key
first_entry = False
if len(output_line) + len(new_entry) >= 80:
print output_line
output_line = ' '
output_line += new_entry
if key in ('quot','copy','gt','lt','reg','amp'):
new_entry = ' | "%s"' % key.upper()
if len(output_line) + len(new_entry) >= 80:
print output_line
output_line = ' '
output_line += new_entry
print output_line, ')'
print '%{'
print ' private static final Map<String,String> upperCaseVariantsAccepted'
print ' = new HashMap<String,String>();'
print ' static {'
print ' upperCaseVariantsAccepted.put("quot", "QUOT");'
print ' upperCaseVariantsAccepted.put("copy", "COPY");'
print ' upperCaseVariantsAccepted.put("gt", "GT");'
print ' upperCaseVariantsAccepted.put("lt", "LT");'
print ' upperCaseVariantsAccepted.put("reg", "REG");'
print ' upperCaseVariantsAccepted.put("amp", "AMP");'
print ' }'
print ' private static final CharArrayMap<Character> entityValues'
print ' = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys)
print ' static {'
print ' String[] entities = {'
output_line = ' '
for key in keys:
new_entry = ' "%s", "%s",' % (key, codes[key])
if len(output_line) + len(new_entry) >= 80:
print output_line
output_line = ' '
output_line += new_entry
print output_line[:-1]
print ' };'
print ' for (int i = 0 ; i < entities.length ; i += 2) {'
print ' Character value = entities[i + 1].charAt(0);'
print ' entityValues.put(entities[i], value);'
print ' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);'
print ' if (upperCaseVariant != null) {'
print ' entityValues.put(upperCaseVariant, value);'
print ' }'
print ' }'
print " }"
print "%}"
def get_entity_text():
# The text below is taken verbatim from
# <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
text = r"""
F.1. XHTML Character Entities
XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
F.1.1. XHTML Latin 1 Character Entities
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
<!-- ...................................................................... -->
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
<!-- file: xhtml-lat1.ent
Typical invocation:
<!ENTITY % xhtml-lat1
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
"xhtml-lat1.ent" >
%xhtml-lat1;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->
<!ENTITY nbsp "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
<!ENTITY iexcl "&#161;" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
<!ENTITY cent "&#162;" ><!-- cent sign, U+00A2 ISOnum -->
<!ENTITY pound "&#163;" ><!-- pound sign, U+00A3 ISOnum -->
<!ENTITY curren "&#164;" ><!-- currency sign, U+00A4 ISOnum -->
<!ENTITY yen "&#165;" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
<!ENTITY brvbar "&#166;" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
<!ENTITY sect "&#167;" ><!-- section sign, U+00A7 ISOnum -->
<!ENTITY uml "&#168;" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
<!ENTITY copy "&#169;" ><!-- copyright sign, U+00A9 ISOnum -->
<!ENTITY ordf "&#170;" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
<!ENTITY laquo "&#171;" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
<!ENTITY not "&#172;" ><!-- not sign, U+00AC ISOnum -->
<!ENTITY shy "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
<!ENTITY reg "&#174;" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
<!ENTITY macr "&#175;" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
<!ENTITY deg "&#176;" ><!-- degree sign, U+00B0 ISOnum -->
<!ENTITY plusmn "&#177;" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
<!ENTITY sup2 "&#178;" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
<!ENTITY sup3 "&#179;" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
<!ENTITY acute "&#180;" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
<!ENTITY micro "&#181;" ><!-- micro sign, U+00B5 ISOnum -->
<!ENTITY para "&#182;" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
<!ENTITY middot "&#183;" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
<!ENTITY cedil "&#184;" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
<!ENTITY sup1 "&#185;" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
<!ENTITY ordm "&#186;" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
<!ENTITY raquo "&#187;" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
<!ENTITY frac14 "&#188;" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
<!ENTITY frac12 "&#189;" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
<!ENTITY frac34 "&#190;" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
<!ENTITY iquest "&#191;" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
<!ENTITY Agrave "&#192;" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
<!ENTITY Aacute "&#193;" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
<!ENTITY Acirc "&#194;" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
<!ENTITY Atilde "&#195;" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
<!ENTITY Auml "&#196;" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
<!ENTITY Aring "&#197;" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
<!ENTITY AElig "&#198;" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
<!ENTITY Ccedil "&#199;" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
<!ENTITY Egrave "&#200;" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
<!ENTITY Eacute "&#201;" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
<!ENTITY Ecirc "&#202;" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
<!ENTITY Euml "&#203;" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
<!ENTITY Igrave "&#204;" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
<!ENTITY Iacute "&#205;" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
<!ENTITY Icirc "&#206;" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
<!ENTITY Iuml "&#207;" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
<!ENTITY ETH "&#208;" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
<!ENTITY Ntilde "&#209;" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
<!ENTITY Ograve "&#210;" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
<!ENTITY Oacute "&#211;" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
<!ENTITY Ocirc "&#212;" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
<!ENTITY Otilde "&#213;" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
<!ENTITY Ouml "&#214;" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
<!ENTITY times "&#215;" ><!-- multiplication sign, U+00D7 ISOnum -->
<!ENTITY Oslash "&#216;" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
<!ENTITY Ugrave "&#217;" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
<!ENTITY Uacute "&#218;" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
<!ENTITY Ucirc "&#219;" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
<!ENTITY Uuml "&#220;" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
<!ENTITY Yacute "&#221;" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
<!ENTITY THORN "&#222;" ><!-- latin capital THORN, U+00DE ISOlat1 -->
<!ENTITY szlig "&#223;" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
<!ENTITY agrave "&#224;" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
<!ENTITY aacute "&#225;" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
<!ENTITY acirc "&#226;" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
<!ENTITY atilde "&#227;" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
<!ENTITY auml "&#228;" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
<!ENTITY aring "&#229;" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
<!ENTITY aelig "&#230;" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
<!ENTITY ccedil "&#231;" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
<!ENTITY egrave "&#232;" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
<!ENTITY eacute "&#233;" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
<!ENTITY ecirc "&#234;" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
<!ENTITY euml "&#235;" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
<!ENTITY igrave "&#236;" ><!-- latin small i with grave, U+00EC ISOlat1 -->
<!ENTITY iacute "&#237;" ><!-- latin small i with acute, U+00ED ISOlat1 -->
<!ENTITY icirc "&#238;" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
<!ENTITY iuml "&#239;" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
<!ENTITY eth "&#240;" ><!-- latin small eth, U+00F0 ISOlat1 -->
<!ENTITY ntilde "&#241;" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
<!ENTITY ograve "&#242;" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
<!ENTITY oacute "&#243;" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
<!ENTITY ocirc "&#244;" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
<!ENTITY otilde "&#245;" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
<!ENTITY ouml "&#246;" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
<!ENTITY divide "&#247;" ><!-- division sign, U+00F7 ISOnum -->
<!ENTITY oslash "&#248;" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
<!ENTITY ugrave "&#249;" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
<!ENTITY uacute "&#250;" ><!-- latin small u with acute, U+00FA ISOlat1 -->
<!ENTITY ucirc "&#251;" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
<!ENTITY uuml "&#252;" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
<!ENTITY yacute "&#253;" ><!-- latin small y with acute, U+00FD ISOlat1 -->
<!ENTITY thorn "&#254;" ><!-- latin small thorn with, U+00FE ISOlat1 -->
<!ENTITY yuml "&#255;" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
<!-- end of xhtml-lat1.ent -->
F.1.2. XHTML Special Characters
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
<!-- ...................................................................... -->
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
<!-- file: xhtml-special.ent
Typical invocation:
<!ENTITY % xhtml-special
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
"xhtml-special.ent" >
%xhtml-special;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
Revisions:
2000-10-28: added &apos; and altered XML Predefined Entities for compatibility
-->
<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->
<!-- C0 Controls and Basic Latin -->
<!ENTITY lt "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
<!ENTITY gt "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
<!ENTITY amp "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
<!ENTITY apos "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
<!ENTITY quot "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
<!-- Latin Extended-A -->
<!ENTITY OElig "&#338;" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
<!ENTITY oelig "&#339;" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
<!-- ligature is a misnomer, this is a separate character in some languages -->
<!ENTITY Scaron "&#352;" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
<!ENTITY scaron "&#353;" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
<!ENTITY Yuml "&#376;" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
<!-- Spacing Modifier Letters -->
<!ENTITY circ "&#710;" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
<!ENTITY tilde "&#732;" ><!-- small tilde, U+02DC ISOdia -->
<!-- General Punctuation -->
<!ENTITY ensp "&#8194;" ><!-- en space, U+2002 ISOpub -->
<!ENTITY emsp "&#8195;" ><!-- em space, U+2003 ISOpub -->
<!ENTITY thinsp "&#8201;" ><!-- thin space, U+2009 ISOpub -->
<!ENTITY zwnj "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
<!ENTITY zwj "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
<!ENTITY lrm "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
<!ENTITY rlm "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
<!ENTITY ndash "&#8211;" ><!-- en dash, U+2013 ISOpub -->
<!ENTITY mdash "&#8212;" ><!-- em dash, U+2014 ISOpub -->
<!ENTITY lsquo "&#8216;" ><!-- left single quotation mark, U+2018 ISOnum -->
<!ENTITY rsquo "&#8217;" ><!-- right single quotation mark, U+2019 ISOnum -->
<!ENTITY sbquo "&#8218;" ><!-- single low-9 quotation mark, U+201A NEW -->
<!ENTITY ldquo "&#8220;" ><!-- left double quotation mark, U+201C ISOnum -->
<!ENTITY rdquo "&#8221;" ><!-- right double quotation mark, U+201D ISOnum -->
<!ENTITY bdquo "&#8222;" ><!-- double low-9 quotation mark, U+201E NEW -->
<!ENTITY dagger "&#8224;" ><!-- dagger, U+2020 ISOpub -->
<!ENTITY Dagger "&#8225;" ><!-- double dagger, U+2021 ISOpub -->
<!ENTITY permil "&#8240;" ><!-- per mille sign, U+2030 ISOtech -->
<!-- lsaquo is proposed but not yet ISO standardized -->
<!ENTITY lsaquo "&#8249;" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
<!-- rsaquo is proposed but not yet ISO standardized -->
<!ENTITY rsaquo "&#8250;" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
<!ENTITY euro "&#8364;" ><!-- euro sign, U+20AC NEW -->
<!-- end of xhtml-special.ent -->
F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
<!-- ...................................................................... -->
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
<!-- file: xhtml-symbol.ent
Typical invocation:
<!ENTITY % xhtml-symbol
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
"xhtml-symbol.ent" >
%xhtml-symbol;
This DTD module is identified by the PUBLIC and SYSTEM identifiers:
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
Portions (C) International Organization for Standardization 1986:
Permission to copy in any form is granted for use with conforming
SGML systems and applications as defined in ISO 8879, provided
this notice is included in all copies.
-->
<!-- Relevant ISO entity set is given unless names are newly introduced.
New names (i.e., not in ISO 8879 [SGML] list) do not clash with
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
numbers are given for each character, in hex. Entity values are
decimal conversions of the ISO 10646 values and refer to the
document character set. Names are Unicode [UNICODE] names.
-->
<!-- Latin Extended-B -->
<!ENTITY fnof "&#402;" ><!-- latin small f with hook = function
= florin, U+0192 ISOtech -->
<!-- Greek -->
<!ENTITY Alpha "&#913;" ><!-- greek capital letter alpha, U+0391 -->
<!ENTITY Beta "&#914;" ><!-- greek capital letter beta, U+0392 -->
<!ENTITY Gamma "&#915;" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
<!ENTITY Delta "&#916;" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
<!ENTITY Epsilon "&#917;" ><!-- greek capital letter epsilon, U+0395 -->
<!ENTITY Zeta "&#918;" ><!-- greek capital letter zeta, U+0396 -->
<!ENTITY Eta "&#919;" ><!-- greek capital letter eta, U+0397 -->
<!ENTITY Theta "&#920;" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
<!ENTITY Iota "&#921;" ><!-- greek capital letter iota, U+0399 -->
<!ENTITY Kappa "&#922;" ><!-- greek capital letter kappa, U+039A -->
<!ENTITY Lambda "&#923;" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
<!ENTITY Mu "&#924;" ><!-- greek capital letter mu, U+039C -->
<!ENTITY Nu "&#925;" ><!-- greek capital letter nu, U+039D -->
<!ENTITY Xi "&#926;" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
<!ENTITY Omicron "&#927;" ><!-- greek capital letter omicron, U+039F -->
<!ENTITY Pi "&#928;" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
<!ENTITY Rho "&#929;" ><!-- greek capital letter rho, U+03A1 -->
<!-- there is no Sigmaf, and no U+03A2 character either -->
<!ENTITY Sigma "&#931;" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
<!ENTITY Tau "&#932;" ><!-- greek capital letter tau, U+03A4 -->
<!ENTITY Upsilon "&#933;" ><!-- greek capital letter upsilon,
U+03A5 ISOgrk3 -->
<!ENTITY Phi "&#934;" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
<!ENTITY Chi "&#935;" ><!-- greek capital letter chi, U+03A7 -->
<!ENTITY Psi "&#936;" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
<!ENTITY Omega "&#937;" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
<!ENTITY alpha "&#945;" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
<!ENTITY beta "&#946;" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
<!ENTITY gamma "&#947;" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
<!ENTITY delta "&#948;" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
<!ENTITY epsilon "&#949;" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
<!ENTITY zeta "&#950;" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
<!ENTITY eta "&#951;" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
<!ENTITY theta "&#952;" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
<!ENTITY iota "&#953;" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
<!ENTITY kappa "&#954;" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
<!ENTITY lambda "&#955;" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
<!ENTITY mu "&#956;" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
<!ENTITY nu "&#957;" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
<!ENTITY xi "&#958;" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
<!ENTITY omicron "&#959;" ><!-- greek small letter omicron, U+03BF NEW -->
<!ENTITY pi "&#960;" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
<!ENTITY rho "&#961;" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
<!ENTITY sigmaf "&#962;" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
<!ENTITY sigma "&#963;" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
<!ENTITY tau "&#964;" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
<!ENTITY upsilon "&#965;" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
<!ENTITY phi "&#966;" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
<!ENTITY chi "&#967;" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
<!ENTITY psi "&#968;" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
<!ENTITY omega "&#969;" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
<!ENTITY thetasym "&#977;" ><!-- greek small letter theta symbol, U+03D1 NEW -->
<!ENTITY upsih "&#978;" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
<!ENTITY piv "&#982;" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
<!-- General Punctuation -->
<!ENTITY bull "&#8226;" ><!-- bullet = black small circle, U+2022 ISOpub -->
<!-- bullet is NOT the same as bullet operator, U+2219 -->
<!ENTITY hellip "&#8230;" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
<!ENTITY prime "&#8242;" ><!-- prime = minutes = feet, U+2032 ISOtech -->
<!ENTITY Prime "&#8243;" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
<!ENTITY oline "&#8254;" ><!-- overline = spacing overscore, U+203E NEW -->
<!ENTITY frasl "&#8260;" ><!-- fraction slash, U+2044 NEW -->
<!-- Letterlike Symbols -->
<!ENTITY weierp "&#8472;" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
<!ENTITY image "&#8465;" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
<!ENTITY real "&#8476;" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
<!ENTITY trade "&#8482;" ><!-- trade mark sign, U+2122 ISOnum -->
<!ENTITY alefsym "&#8501;" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
the same glyph could be used to depict both characters -->
<!-- Arrows -->
<!ENTITY larr "&#8592;" ><!-- leftwards arrow, U+2190 ISOnum -->
<!ENTITY uarr "&#8593;" ><!-- upwards arrow, U+2191 ISOnum-->
<!ENTITY rarr "&#8594;" ><!-- rightwards arrow, U+2192 ISOnum -->
<!ENTITY darr "&#8595;" ><!-- downwards arrow, U+2193 ISOnum -->
<!ENTITY harr "&#8596;" ><!-- left right arrow, U+2194 ISOamsa -->
<!ENTITY crarr "&#8629;" ><!-- downwards arrow with corner leftwards
= carriage return, U+21B5 NEW -->
<!ENTITY lArr "&#8656;" ><!-- leftwards double arrow, U+21D0 ISOtech -->
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
but also does not have any other character for that function. So ? lArr can
be used for 'is implied by' as ISOtech suggests -->
<!ENTITY uArr "&#8657;" ><!-- upwards double arrow, U+21D1 ISOamsa -->
<!ENTITY rArr "&#8658;" ><!-- rightwards double arrow, U+21D2 ISOtech -->
<!-- Unicode does not say this is the 'implies' character but does not have
another character with this function so ?
rArr can be used for 'implies' as ISOtech suggests -->
<!ENTITY dArr "&#8659;" ><!-- downwards double arrow, U+21D3 ISOamsa -->
<!ENTITY hArr "&#8660;" ><!-- left right double arrow, U+21D4 ISOamsa -->
<!-- Mathematical Operators -->
<!ENTITY forall "&#8704;" ><!-- for all, U+2200 ISOtech -->
<!ENTITY part "&#8706;" ><!-- partial differential, U+2202 ISOtech -->
<!ENTITY exist "&#8707;" ><!-- there exists, U+2203 ISOtech -->
<!ENTITY empty "&#8709;" ><!-- empty set = null set, U+2205 ISOamso -->
<!ENTITY nabla "&#8711;" ><!-- nabla = backward difference, U+2207 ISOtech -->
<!ENTITY isin "&#8712;" ><!-- element of, U+2208 ISOtech -->
<!ENTITY notin "&#8713;" ><!-- not an element of, U+2209 ISOtech -->
<!ENTITY ni "&#8715;" ><!-- contains as member, U+220B ISOtech -->
<!-- should there be a more memorable name than 'ni'? -->
<!ENTITY prod "&#8719;" ><!-- n-ary product = product sign, U+220F ISOamsb -->
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
the same glyph might be used for both -->
<!ENTITY sum "&#8721;" ><!-- n-ary sumation, U+2211 ISOamsb -->
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
though the same glyph might be used for both -->
<!ENTITY minus "&#8722;" ><!-- minus sign, U+2212 ISOtech -->
<!ENTITY lowast "&#8727;" ><!-- asterisk operator, U+2217 ISOtech -->
<!ENTITY radic "&#8730;" ><!-- square root = radical sign, U+221A ISOtech -->
<!ENTITY prop "&#8733;" ><!-- proportional to, U+221D ISOtech -->
<!ENTITY infin "&#8734;" ><!-- infinity, U+221E ISOtech -->
<!ENTITY ang "&#8736;" ><!-- angle, U+2220 ISOamso -->
<!ENTITY and "&#8743;" ><!-- logical and = wedge, U+2227 ISOtech -->
<!ENTITY or "&#8744;" ><!-- logical or = vee, U+2228 ISOtech -->
<!ENTITY cap "&#8745;" ><!-- intersection = cap, U+2229 ISOtech -->
<!ENTITY cup "&#8746;" ><!-- union = cup, U+222A ISOtech -->
<!ENTITY int "&#8747;" ><!-- integral, U+222B ISOtech -->
<!ENTITY there4 "&#8756;" ><!-- therefore, U+2234 ISOtech -->
<!ENTITY sim "&#8764;" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
<!-- tilde operator is NOT the same character as the tilde, U+007E,
although the same glyph might be used to represent both -->
<!ENTITY cong "&#8773;" ><!-- approximately equal to, U+2245 ISOtech -->
<!ENTITY asymp "&#8776;" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
<!ENTITY ne "&#8800;" ><!-- not equal to, U+2260 ISOtech -->
<!ENTITY equiv "&#8801;" ><!-- identical to, U+2261 ISOtech -->
<!ENTITY le "&#8804;" ><!-- less-than or equal to, U+2264 ISOtech -->
<!ENTITY ge "&#8805;" ><!-- greater-than or equal to, U+2265 ISOtech -->
<!ENTITY sub "&#8834;" ><!-- subset of, U+2282 ISOtech -->
<!ENTITY sup "&#8835;" ><!-- superset of, U+2283 ISOtech -->
<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
font encoding and is not included. Should it be, for symmetry?
It is in ISOamsn -->
<!ENTITY nsub "&#8836;" ><!-- not a subset of, U+2284 ISOamsn -->
<!ENTITY sube "&#8838;" ><!-- subset of or equal to, U+2286 ISOtech -->
<!ENTITY supe "&#8839;" ><!-- superset of or equal to, U+2287 ISOtech -->
<!ENTITY oplus "&#8853;" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
<!ENTITY otimes "&#8855;" ><!-- circled times = vector product, U+2297 ISOamsb -->
<!ENTITY perp "&#8869;" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
<!ENTITY sdot "&#8901;" ><!-- dot operator, U+22C5 ISOamsb -->
<!-- dot operator is NOT the same character as U+00B7 middle dot -->
<!-- Miscellaneous Technical -->
<!ENTITY lceil "&#8968;" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
<!ENTITY rceil "&#8969;" ><!-- right ceiling, U+2309 ISOamsc -->
<!ENTITY lfloor "&#8970;" ><!-- left floor = apl downstile, U+230A ISOamsc -->
<!ENTITY rfloor "&#8971;" ><!-- right floor, U+230B ISOamsc -->
<!ENTITY lang "&#9001;" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
<!-- lang is NOT the same character as U+003C 'less than'
or U+2039 'single left-pointing angle quotation mark' -->
<!ENTITY rang "&#9002;" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
<!-- rang is NOT the same character as U+003E 'greater than'
or U+203A 'single right-pointing angle quotation mark' -->
<!-- Geometric Shapes -->
<!ENTITY loz "&#9674;" ><!-- lozenge, U+25CA ISOpub -->
<!-- Miscellaneous Symbols -->
<!ENTITY spades "&#9824;" ><!-- black spade suit, U+2660 ISOpub -->
<!-- black here seems to mean filled as opposed to hollow -->
<!ENTITY clubs "&#9827;" ><!-- black club suit = shamrock, U+2663 ISOpub -->
<!ENTITY hearts "&#9829;" ><!-- black heart suit = valentine, U+2665 ISOpub -->
<!ENTITY diams "&#9830;" ><!-- black diamond suit, U+2666 ISOpub -->
<!-- end of xhtml-symbol.ent -->
"""
return text
def get_apache_license():
license = r"""/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"""
return license
main()
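Note that the script above uses Python 2 print statements and invokes main() at module load, so a hypothetical invocation (the output file name is illustrative) is: python2 htmlentity.py > htmlentities.jflex-fragment. The generated CharacterEntities alternation and entityValues map are meant to be pasted into HTMLStripCharFilter.jflex.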

61
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/charfilter/package.html

@@ -0,0 +1,61 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
<p>
Normalization of text before the tokenizer.
</p>
<p>
CharFilters are chainable filters that normalize text before tokenization
and provide mappings between normalized text offsets and the corresponding
offset in the original text.
</p>
<H2>CharFilter offset mappings</H2>
<p>
CharFilters modify an input stream via a series of substring
replacements (including deletions and insertions) to produce an output
stream. There are three possible replacement cases: the replacement
string has the same length as the original substring; the replacement
is shorter; and the replacement is longer. In the latter two cases
(when the replacement has a different length than the original),
one or more offset correction mappings are required.
</p>
<p>
When the replacement is shorter than the original (e.g. when the
replacement is the empty string), a single offset correction mapping
should be added at the replacement's end offset in the output stream.
The <code>cumulativeDiff</code> parameter to the
<code>addOffCorrectMap()</code> method will be the sum of all
previous replacement offset adjustments, with the addition of the
difference between the lengths of the original substring and the
replacement string (a positive value).
</p>
<p>
When the replacement is longer than the original (e.g. when the
original is the empty string), you should add as many offset
correction mappings as the difference between the lengths of the
replacement string and the original substring, starting at the
end offset the original substring would have had in the output stream.
The <code>cumulativeDiff</code> parameter to the
<code>addOffCorrectMap()</code> method will be the sum of all
previous replacement offset adjustments, with the addition of the
difference between the lengths of the original substring and the
replacement string so far (a negative value).
</p>
</body>
</html>
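A concrete illustration of the shorter-replacement case described above, using the MappingCharFilter from this commit (a sketch; the offsets in the comments follow from the rules stated in this page):

import java.io.StringReader;
import com.fr.third.org.apache.lucene.analysis.charfilter.MappingCharFilter;
import com.fr.third.org.apache.lucene.analysis.charfilter.NormalizeCharMap;

public class OffsetCorrectionDemo {
  public static void main(String[] args) throws Exception {
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("&amp;", "&");  // 5 input chars -> 1 output char: cumulativeDiff grows by 4
    MappingCharFilter filter =
        new MappingCharFilter(builder.build(), new StringReader("a &amp; b"));
    while (filter.read() != -1) { }  // drain; the output stream is "a & b"
    // 'b' is at output offset 4; the correction recorded at the replacement's
    // end offset shifts it by the cumulative diff of 4, back to input offset 8.
    System.out.println(filter.correctOffset(4));  // prints 8
  }
}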

104
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKAnalyzer.java

@@ -0,0 +1,104 @@
package com.fr.third.org.apache.lucene.analysis.cjk;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import com.fr.third.org.apache.lucene.analysis.Analyzer;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter;
import com.fr.third.org.apache.lucene.analysis.core.StopFilter;
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import com.fr.third.org.apache.lucene.util.Version;
/**
* An {@link Analyzer} that tokenizes text with {@link StandardTokenizer},
* normalizes content with {@link CJKWidthFilter}, folds case with
* {@link LowerCaseFilter}, forms bigrams of CJK with {@link CJKBigramFilter},
* and filters stopwords with {@link StopFilter}
*/
public final class CJKAnalyzer extends StopwordAnalyzerBase {
/**
* File containing default CJK stopwords.
* <p/>
* Currently it contains some common English words that are not usually
* useful for searching, as well as some double-byte punctuation marks.
*/
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
/**
* Returns an unmodifiable instance of the default stop-words set.
* @return an unmodifiable instance of the default stop-words set.
*/
public static CharArraySet getDefaultStopSet(){
return DefaultSetHolder.DEFAULT_STOP_SET;
}
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
static {
try {
DEFAULT_STOP_SET = loadStopwordSet(false, CJKAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new RuntimeException("Unable to load default stopword set", ex);
}
}
}
/**
* Builds an analyzer which removes words in {@link #getDefaultStopSet()}.
*/
public CJKAnalyzer(Version matchVersion) {
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Builds an analyzer with the given stop words
*
* @param matchVersion
* lucene compatibility version
* @param stopwords
* a stopword set
*/
public CJKAnalyzer(Version matchVersion, CharArraySet stopwords){
super(matchVersion, stopwords);
}
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
// run the widthfilter first before bigramming, it sometimes combines characters.
TokenStream result = new CJKWidthFilter(source);
result = new LowerCaseFilter(matchVersion, result);
result = new CJKBigramFilter(result);
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
} else {
final Tokenizer source = new CJKTokenizer(reader);
return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
}
}
}
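
Editor's sketch (not part of this commit): a minimal usage example for the
analyzer above, assuming the repackaged Lucene 4.0 API; the field name
"content" and the sample text are illustrative. For the mixed input below it
should emit 我是, 是中, 中国, 国人 and java.

import java.io.StringReader;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.cjk.CJKAnalyzer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.util.Version;

public class CJKAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    CJKAnalyzer analyzer = new CJKAnalyzer(Version.LUCENE_40);
    TokenStream ts = analyzer.tokenStream("content", new StringReader("我是中国人 java"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                          // mandatory before incrementToken()
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();                            // records the final offset state
    ts.close();
  }
}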

363
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKBigramFilter.java

@ -0,0 +1,363 @@
package com.fr.third.org.apache.lucene.analysis.cjk;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.fr.third.org.apache.lucene.util.ArrayUtil;
/**
* Forms bigrams of CJK terms that are generated from StandardTokenizer
* or ICUTokenizer.
* <p>
* CJK types are set by these tokenizers, but you can also use
* {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
* of the CJK scripts are turned into bigrams.
* <p>
* By default, when a CJK character has no adjacent characters to form
* a bigram, it is output in unigram form. If you want to always output
* both unigrams and bigrams, set the <code>outputUnigrams</code>
* flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
* This can be used for a combined unigram+bigram approach.
* <p>
* In all cases, all non-CJK input is passed thru unmodified.
*/
public final class CJKBigramFilter extends TokenFilter {
// configuration
/** bigram flag for Han Ideographs */
public static final int HAN = 1;
/** bigram flag for Hiragana */
public static final int HIRAGANA = 2;
/** bigram flag for Katakana */
public static final int KATAKANA = 4;
/** bigram flag for Hangul */
public static final int HANGUL = 8;
/** when we emit a bigram, it is then marked as this type */
public static final String DOUBLE_TYPE = "<DOUBLE>";
/** when we emit a unigram, it is then marked as this type */
public static final String SINGLE_TYPE = "<SINGLE>";
// the types from standardtokenizer
private static final String HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
private static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
private static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
// sentinel value for ignoring a script
private static final Object NO = new Object();
// these are set to either their type or NO if we want to pass them thru
private final Object doHan;
private final Object doHiragana;
private final Object doKatakana;
private final Object doHangul;
// true if we should output unigram tokens always
private final boolean outputUnigrams;
private boolean ngramState; // false = output unigram, true = output bigram
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
// buffers containing codepoint and offsets in parallel
int buffer[] = new int[8];
int startOffset[] = new int[8];
int endOffset[] = new int[8];
// length of valid buffer
int bufferLen;
// current buffer index
int index;
// the last end offset, to determine if we should bigram across tokens
int lastEndOffset;
private boolean exhausted;
/**
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
* CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
*/
public CJKBigramFilter(TokenStream in) {
this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
}
/**
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
* CJKBigramFilter(in, flags, false)}
*/
public CJKBigramFilter(TokenStream in, int flags) {
this(in, flags, false);
}
/**
* Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
* and whether or not unigrams should also be output.
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
* @param outputUnigrams true if unigrams for the selected writing systems should also be output.
* When this is false, unigrams are only output when there are no adjacent
* characters to form a bigram.
*/
public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
super(in);
doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
this.outputUnigrams = outputUnigrams;
}
/*
* much of this complexity revolves around handling the special case of a
* "lone cjk character" where cjktokenizer would output a unigram. this
* is also the only time we ever have to captureState.
*/
@Override
public boolean incrementToken() throws IOException {
while (true) {
if (hasBufferedBigram()) {
// case 1: we have multiple remaining codepoints buffered,
// so we can emit a bigram here.
if (outputUnigrams) {
// when also outputting unigrams, we output the unigram first,
// then rewind back to revisit the bigram.
// so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
// the logic in hasBufferedUnigram ensures we output the C,
// even though it did actually have adjacent CJK characters.
if (ngramState) {
flushBigram();
} else {
flushUnigram();
index--;
}
ngramState = !ngramState;
} else {
flushBigram();
}
return true;
} else if (doNext()) {
// case 2: look at the token type. should we form any n-grams?
String type = typeAtt.type();
if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul) {
// acceptable CJK type: we form n-grams from these.
// as long as the offsets are aligned, we just add these to our current buffer.
// otherwise, we clear the buffer and start over.
if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue
if (hasBufferedUnigram()) {
// we have a buffered unigram, and we peeked ahead to see if we could form
// a bigram, but we can't, because the offsets are unaligned. capture the state
// of this peeked data to be revisited next time thru the loop, and dump our unigram.
loneState = captureState();
flushUnigram();
return true;
}
index = 0;
bufferLen = 0;
}
refill();
} else {
// not a CJK type: we just return these as-is.
if (hasBufferedUnigram()) {
// we have a buffered unigram, and we peeked ahead to see if we could form
// a bigram, but we can't, because it's not a CJK type. capture the state
// of this peeked data to be revisited next time thru the loop, and dump our unigram.
loneState = captureState();
flushUnigram();
return true;
}
return true;
}
} else {
// case 3: we have only zero or 1 codepoints buffered,
// so not enough to form a bigram. But, we also have no
// more input. So if we have a buffered codepoint, emit
// a unigram, otherwise, its end of stream.
if (hasBufferedUnigram()) {
flushUnigram(); // flush our remaining unigram
return true;
}
return false;
}
}
}
private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams
/**
* looks at the next input token, returning false if none is available
*/
private boolean doNext() throws IOException {
if (loneState != null) {
restoreState(loneState);
loneState = null;
return true;
} else {
if (exhausted) {
return false;
} else if (input.incrementToken()) {
return true;
} else {
exhausted = true;
return false;
}
}
}
/**
* refills buffers with new data from the current token.
*/
private void refill() {
// compact buffers to keep them smallish if they become large
// just a safety check, but technically we only need the last codepoint
if (bufferLen > 64) {
int last = bufferLen - 1;
buffer[0] = buffer[last];
startOffset[0] = startOffset[last];
endOffset[0] = endOffset[last];
bufferLen = 1;
index -= last;
}
char termBuffer[] = termAtt.buffer();
int len = termAtt.length();
int start = offsetAtt.startOffset();
int end = offsetAtt.endOffset();
int newSize = bufferLen + len;
buffer = ArrayUtil.grow(buffer, newSize);
startOffset = ArrayUtil.grow(startOffset, newSize);
endOffset = ArrayUtil.grow(endOffset, newSize);
lastEndOffset = end;
if (end - start != len) {
// crazy offsets (modified by synonym or charfilter): just preserve
for (int i = 0, cp = 0; i < len; i += Character.charCount(cp)) {
cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
startOffset[bufferLen] = start;
endOffset[bufferLen] = end;
bufferLen++;
}
} else {
// normal offsets
for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen) {
cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
cpLen = Character.charCount(cp);
startOffset[bufferLen] = start;
start = endOffset[bufferLen] = start + cpLen;
bufferLen++;
}
}
}
/**
* Flushes a bigram token to output from our buffer
* This is the normal case, e.g. ABC -> AB BC
*/
private void flushBigram() {
clearAttributes();
char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
int len1 = Character.toChars(buffer[index], termBuffer, 0);
int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1);
termAtt.setLength(len2);
offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
typeAtt.setType(DOUBLE_TYPE);
// when outputting unigrams, all bigrams are synonyms that span two unigrams
if (outputUnigrams) {
posIncAtt.setPositionIncrement(0);
posLengthAtt.setPositionLength(2);
}
index++;
}
/**
* Flushes a unigram token to output from our buffer.
* This happens when we encounter isolated CJK characters, either the whole
* CJK string is a single character, or we encounter a CJK character surrounded
* by space, punctuation, english, etc, but not beside any other CJK.
*/
private void flushUnigram() {
clearAttributes();
char termBuffer[] = termAtt.resizeBuffer(2); // maximum unigram length (2 surrogates)
int len = Character.toChars(buffer[index], termBuffer, 0);
termAtt.setLength(len);
offsetAtt.setOffset(startOffset[index], endOffset[index]);
typeAtt.setType(SINGLE_TYPE);
index++;
}
/**
* True if we have multiple codepoints sitting in our buffer
*/
private boolean hasBufferedBigram() {
return bufferLen - index > 1;
}
/**
* True if we have a single codepoint sitting in our buffer, where its future
* (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen
* inputs.
*/
private boolean hasBufferedUnigram() {
if (outputUnigrams) {
// when outputting unigrams always
return bufferLen - index == 1;
} else {
// otherwise it's only when we have a lone CJK character
return bufferLen == 1 && index == 0;
}
}
@Override
public void reset() throws IOException {
super.reset();
bufferLen = 0;
index = 0;
lastEndOffset = 0;
loneState = null;
exhausted = false;
ngramState = false;
}
}
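
Editor's sketch (not part of this commit): a fragment showing the explicit
script flags, assuming "reader" is an already-opened java.io.Reader. Only Han
runs are bigrammed; with outputUnigrams=true each Han character is also
emitted as a unigram, with the bigrams overlaid at position increment 0 as
described above. Hiragana, Katakana and Hangul pass through as the tokenizer
produced them.

// Fragment only: the same chain CJKAnalyzer builds, but bigramming Han only
// and also keeping unigrams.
Tokenizer source = new StandardTokenizer(Version.LUCENE_40, reader);
TokenStream result = new CJKWidthFilter(source);
result = new LowerCaseFilter(Version.LUCENE_40, result);
result = new CJKBigramFilter(result, CJKBigramFilter.HAN, true);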

67
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java

@ -0,0 +1,67 @@
package com.fr.third.org.apache.lucene.analysis.cjk;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.cjk.CJKBigramFilter;
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link CJKBigramFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_cjk" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.CJKWidthFilterFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.CJKBigramFilterFactory"
* han="true" hiragana="true"
* katakana="true" hangul="true" outputUnigrams="false" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class CJKBigramFilterFactory extends TokenFilterFactory {
int flags;
boolean outputUnigrams;
@Override
public void init(Map<String,String> args) {
super.init(args);
flags = 0;
if (getBoolean("han", true)) {
flags |= CJKBigramFilter.HAN;
}
if (getBoolean("hiragana", true)) {
flags |= CJKBigramFilter.HIRAGANA;
}
if (getBoolean("katakana", true)) {
flags |= CJKBigramFilter.KATAKANA;
}
if (getBoolean("hangul", true)) {
flags |= CJKBigramFilter.HANGUL;
}
outputUnigrams = getBoolean("outputUnigrams", false);
}
@Override
public TokenStream create(TokenStream input) {
return new CJKBigramFilter(input, flags, outputUnigrams);
}
}
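
Editor's sketch (not part of this commit): the programmatic equivalent of the
Solr configuration shown in the javadoc above, as a fragment; "tokens" stands
for an upstream TokenStream.

// Fragment only: the args map mirrors the XML attributes of the filter element.
java.util.Map<String,String> args = new java.util.HashMap<String,String>();
args.put("han", "true");
args.put("hiragana", "true");
args.put("katakana", "false");        // do not bigram Katakana
args.put("hangul", "true");
args.put("outputUnigrams", "false");
CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
factory.init(args);
TokenStream filtered = factory.create(tokens);  // tokens: upstream TokenStream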

311
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKTokenizer.java

@ -0,0 +1,311 @@
package com.fr.third.org.apache.lucene.analysis.cjk;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.fr.third.org.apache.lucene.util.AttributeSource;
/**
* CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
* <p>
* Tokens are generated for every two adjacent characters, with overlapping matches.
* </p>
* <p>
* Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
* </p>
* Additionally, the following is applied to Latin text (such as English):
* <ul>
* <li>Text is converted to lowercase.
* <li>Numeric digits, '+', '#', and '_' are tokenized as letters.
* <li>Full-width forms are converted to half-width forms.
* </ul>
* For more info on Asian language (Chinese, Japanese, and Korean) text segmentation:
* please search <a
* href="http://www.google.com/search?q=word+chinese+segment">google</a>
*
* @deprecated Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead.
*/
@Deprecated
public final class CJKTokenizer extends Tokenizer {
//~ Static fields/initializers ---------------------------------------------
/** Word token type */
static final int WORD_TYPE = 0;
/** Single byte token type */
static final int SINGLE_TOKEN_TYPE = 1;
/** Double byte token type */
static final int DOUBLE_TOKEN_TYPE = 2;
/** Names for token types */
static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
/** Max word length */
private static final int MAX_WORD_LEN = 255;
/** buffer size: */
private static final int IO_BUFFER_SIZE = 256;
//~ Instance fields --------------------------------------------------------
/** word offset, used to indicate which character (in the input) is being parsed */
private int offset = 0;
/** the index used only for ioBuffer */
private int bufferIndex = 0;
/** data length */
private int dataLen = 0;
/**
* character buffer, store the characters which are used to compose <br>
* the returned Token
*/
private final char[] buffer = new char[MAX_WORD_LEN];
/**
* I/O buffer, used to store the content of the input (one of the <br>
* members of Tokenizer)
*/
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
/** word type: single=>ASCII double=>non-ASCII word=>default */
private int tokenType = WORD_TYPE;
/**
* flag: the previous character is a double-byte character that has already
* been emitted as part of a bigram. E.g. for input "C1C2C3C4": emit C1C2
* (C1 is tokened), then C2C3 (C2 is tokened), then C3C4 (C3 is tokened),
* giving "C1C2 C2C3 C3C4"
*/
private boolean preIsTokened = false;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
//~ Constructors -----------------------------------------------------------
/**
* Construct a token stream processing the given input.
*
* @param in I/O reader
*/
public CJKTokenizer(Reader in) {
super(in);
}
public CJKTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
public CJKTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
//~ Methods ----------------------------------------------------------------
/**
* Returns true for the next token in the stream, or false at EOS.
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
* for detail.
*
* @return false for end of stream, true otherwise
*
* @throws IOException when a read error <br>
* occurs in the input stream
*
*/
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
/** how many characters have been stored in the buffer */
while(true) { // loop until we find a non-empty token
int length = 0;
/** the position used to create Token */
int start = offset;
while (true) { // loop until we've found a full token
/** current character */
char c;
/** unicode block of current character for detail */
Character.UnicodeBlock ub;
offset++;
if (bufferIndex >= dataLen) {
dataLen = input.read(ioBuffer);
bufferIndex = 0;
}
if (dataLen == -1) {
if (length > 0) {
if (preIsTokened == true) {
length = 0;
preIsTokened = false;
}
else{
offset--;
}
break;
} else {
offset--;
return false;
}
} else {
//get current character
c = ioBuffer[bufferIndex++];
//get the UnicodeBlock of the current character
ub = Character.UnicodeBlock.of(c);
}
//if the current character is ASCII or Extend ASCII
if ((ub == Character.UnicodeBlock.BASIC_LATIN)
|| (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
) {
if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
int i = (int) c;
if (i >= 65281 && i <= 65374) {
// convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
i = i - 65248;
c = (char) i;
}
}
// if the current character is a letter or "_" "+" "#"
if (Character.isLetterOrDigit(c)
|| ((c == '_') || (c == '+') || (c == '#'))
) {
if (length == 0) {
// "javaC1C2C3C4linux" <br>
// ^--: the current character starts a new ASCII token
start = offset - 1;
} else if (tokenType == DOUBLE_TOKEN_TYPE) {
// "javaC1C2C3C4linux" <br>
// ^--: the previous character is non-ASCII,
// and the current character is ASCII
offset--;
bufferIndex--;
if (preIsTokened == true) {
// only one non-ASCII character has been stored
length = 0;
preIsTokened = false;
break;
} else {
break;
}
}
// store the LowerCase(c) in the buffer
buffer[length++] = Character.toLowerCase(c);
tokenType = SINGLE_TOKEN_TYPE;
// break the procedure if buffer overflowed!
if (length == MAX_WORD_LEN) {
break;
}
} else if (length > 0) {
if (preIsTokened == true) {
length = 0;
preIsTokened = false;
} else {
break;
}
}
} else {
// non-ASCII letter, e.g."C1C2C3C4"
if (Character.isLetter(c)) {
if (length == 0) {
start = offset - 1;
buffer[length++] = c;
tokenType = DOUBLE_TOKEN_TYPE;
} else {
if (tokenType == SINGLE_TOKEN_TYPE) {
offset--;
bufferIndex--;
//return the previous ASCII characters
break;
} else {
buffer[length++] = c;
tokenType = DOUBLE_TOKEN_TYPE;
if (length == 2) {
offset--;
bufferIndex--;
preIsTokened = true;
break;
}
}
}
} else if (length > 0) {
if (preIsTokened == true) {
// empty the buffer
length = 0;
preIsTokened = false;
} else {
break;
}
}
}
}
if (length > 0) {
termAtt.copyBuffer(buffer, 0, length);
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
return true;
} else if (dataLen == -1) {
offset--;
return false;
}
// Cycle back and try for the next token (don't
// return an empty string)
}
}
@Override
public final void end() {
// set final offset
final int finalOffset = correctOffset(offset);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset() throws IOException {
super.reset();
offset = bufferIndex = dataLen = 0;
preIsTokened = false;
tokenType = WORD_TYPE;
}
}
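
Editor's sketch (not part of this commit): a fragment exercising the
deprecated tokenizer directly, illustrating the overlapping bigrams and the
token types described in the class javadoc.

// Fragment only: prints "java (single)", then 我是, 是中, 中国, 国人 (double).
CJKTokenizer tok = new CJKTokenizer(new StringReader("java 我是中国人"));
CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
TypeAttribute type = tok.addAttribute(TypeAttribute.class);
tok.reset();
while (tok.incrementToken()) {
  System.out.println(term + " (" + type.type() + ")");
}
tok.end();
tok.close();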

41
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKTokenizerFactory.java

@ -0,0 +1,41 @@
package com.fr.third.org.apache.lucene.analysis.cjk;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.analysis.cjk.CJKTokenizer;
import com.fr.third.org.apache.lucene.analysis.util.TokenizerFactory;
import java.io.Reader;
/**
* Factory for {@link CJKTokenizer}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.CJKTokenizerFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* @deprecated Use {@link CJKBigramFilterFactory} instead.
*/
@Deprecated
public class CJKTokenizerFactory extends TokenizerFactory {
public CJKTokenizer create(Reader in) {
return new CJKTokenizer(in);
}
}

112
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKWidthFilter.java

@ -0,0 +1,112 @@
package com.fr.third.org.apache.lucene.analysis.cjk;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.util.StemmerUtil;
/**
* A {@link TokenFilter} that normalizes CJK width differences:
* <ul>
* <li>Folds fullwidth ASCII variants into the equivalent basic latin
* <li>Folds halfwidth Katakana variants into the equivalent kana
* </ul>
* <p>
* NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD
* Unicode normalization. See the normalization support in the ICU package
* for full normalization.
*/
public final class CJKWidthFilter extends TokenFilter {
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/* halfwidth kana mappings: 0xFF65-0xFF9D
*
* note: 0xFF9C and 0xFF9D are only mapped to 0x3099 and 0x309A
* as a fallback when they cannot properly combine with a preceding
* character into a composed form.
*/
private static final char KANA_NORM[] = new char[] {
0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A
};
public CJKWidthFilter(TokenStream input) {
super(input);
}
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
char text[] = termAtt.buffer();
int length = termAtt.length();
for (int i = 0; i < length; i++) {
final char ch = text[i];
if (ch >= 0xFF01 && ch <= 0xFF5E) {
// Fullwidth ASCII variants
text[i] -= 0xFEE0;
} else if (ch >= 0xFF65 && ch <= 0xFF9F) {
// Halfwidth Katakana variants
if ((ch == 0xFF9E || ch == 0xFF9F) && i > 0 && combine(text, i, ch)) {
length = StemmerUtil.delete(text, i--, length);
} else {
text[i] = KANA_NORM[ch - 0xFF65];
}
}
}
termAtt.setLength(length);
return true;
} else {
return false;
}
}
/* kana combining diffs: 0x30A6-0x30FD */
private static final byte KANA_COMBINE_VOICED[] = new byte[] {
78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
};
private static final byte KANA_COMBINE_HALF_VOICED[] = new byte[] {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/** returns true if we successfully combined the voice mark */
private static boolean combine(char text[], int pos, char ch) {
final char prev = text[pos-1];
if (prev >= 0x30A6 && prev <= 0x30FD) {
text[pos-1] += (ch == 0xFF9F)
? KANA_COMBINE_HALF_VOICED[prev - 0x30A6]
: KANA_COMBINE_VOICED[prev - 0x30A6];
return text[pos-1] != prev;
}
return false;
}
}
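
Editor's sketch (not part of this commit): a fragment showing the folding
behavior on the chain that normally feeds CJKBigramFilter. Halfwidth ﾒｶﾞﾈ
(written with a separate voiced sound mark) should come out as メガネ, and
fullwidth Ｊａｖａ as Java.

// Fragment only: width folding happens before any lowercasing or bigramming.
Tokenizer source = new StandardTokenizer(Version.LUCENE_40,
    new StringReader("ﾒｶﾞﾈ Ｊａｖａ"));
TokenStream ts = new CJKWidthFilter(source);   // emits メガネ, Java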

50
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/CJKWidthFilterFactory.java

@ -0,0 +1,50 @@
package com.fr.third.org.apache.lucene.analysis.cjk;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.cjk.CJKWidthFilter;
import com.fr.third.org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import com.fr.third.org.apache.lucene.analysis.util.MultiTermAwareComponent;
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link CJKWidthFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_cjk" class="solr.TextField"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.CJKWidthFilterFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.CJKBigramFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class CJKWidthFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
@Override
public TokenStream create(TokenStream input) {
return new CJKWidthFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
}
}

42
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cjk/package.html

@ -0,0 +1,42 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
Analyzer for Chinese, Japanese, and Korean, which indexes bigrams.
This analyzer generates bigram terms, which are overlapping groups of two adjacent Han, Hiragana, Katakana, or Hangul characters.
<p>
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
<ul>
<li>ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as a token.
<li>CJKAnalyzer (in this package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
<li>SmartChineseAnalyzer (in the analyzers/smartcn package): Index words (attempt to segment Chinese text into words) as tokens.
</ul>
Example phrase: "我是中国人"
<ol>
<li>ChineseAnalyzer: 我-是-中-国-人</li>
<li>CJKAnalyzer: 我是-是中-中国-国人</li>
<li>SmartChineseAnalyzer: 我-是-中国-人</li>
</ol>
</p>
</body>
</html>

50
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cn/ChineseAnalyzer.java

@ -0,0 +1,50 @@
package com.fr.third.org.apache.lucene.analysis.cn;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import com.fr.third.org.apache.lucene.analysis.standard.StandardAnalyzer; // javadoc @link
import com.fr.third.org.apache.lucene.analysis.Analyzer;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
/**
* An {@link Analyzer} that tokenizes text with {@link ChineseTokenizer} and
* filters with {@link ChineseFilter}
* @deprecated (3.1) Use {@link StandardAnalyzer} instead, which has the same functionality.
* This analyzer will be removed in Lucene 5.0
*/
@Deprecated
public final class ChineseAnalyzer extends Analyzer {
/**
* Creates
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link ChineseTokenizer} filtered with
* {@link ChineseFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new ChineseTokenizer(reader);
return new TokenStreamComponents(source, new ChineseFilter(source));
}
}

104
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cn/ChineseFilter.java

@ -0,0 +1,104 @@
package com.fr.third.org.apache.lucene.analysis.cn;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.core.StopFilter;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;
/**
* A {@link TokenFilter} with a stop word table.
* <ul>
* <li>Numeric tokens are removed.
* <li>English tokens must be larger than 1 character.
* <li>One Chinese character as one Chinese word.
* </ul>
* TO DO:
* <ol>
* <li>Add Chinese stop words, such as \ue400
* <li>Dictionary based Chinese word extraction
* <li>Intelligent Chinese word extraction
* </ol>
*
* @deprecated (3.1) Use {@link StopFilter} instead, which has the same functionality.
* This filter will be removed in Lucene 5.0
*/
@Deprecated
public final class ChineseFilter extends TokenFilter {
// Only English now, Chinese to be added later.
public static final String[] STOP_WORDS = {
"and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such",
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
};
private CharArraySet stopTable;
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public ChineseFilter(TokenStream in) {
super(in);
stopTable = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS), false);
}
@Override
public boolean incrementToken() throws IOException {
while (input.incrementToken()) {
char text[] = termAtt.buffer();
int termLength = termAtt.length();
// why not key off token type here assuming ChineseTokenizer comes first?
if (!stopTable.contains(text, 0, termLength)) {
switch (Character.getType(text[0])) {
case Character.LOWERCASE_LETTER:
case Character.UPPERCASE_LETTER:
// English words/tokens must be larger than 1 character.
if (termLength>1) {
return true;
}
break;
case Character.OTHER_LETTER:
// One Chinese character as one Chinese word.
// Chinese word extraction to be added later here.
return true;
}
}
}
return false;
}
}

36
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cn/ChineseFilterFactory.java

@ -0,0 +1,36 @@
package com.fr.third.org.apache.lucene.analysis.cn;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.cn.ChineseFilter;
import com.fr.third.org.apache.lucene.analysis.core.StopFilterFactory; // javadocs
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link ChineseFilter}
* @deprecated Use {@link StopFilterFactory} instead.
*/
@Deprecated
public class ChineseFilterFactory extends TokenFilterFactory {
public ChineseFilter create(TokenStream in) {
return new ChineseFilter(in);
}
}

169
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cn/ChineseTokenizer.java

@ -0,0 +1,169 @@
package com.fr.third.org.apache.lucene.analysis.cn;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import com.fr.third.org.apache.lucene.util.AttributeSource;
/**
* Tokenize Chinese text as individual Chinese characters.
*
* <p>
* The difference between ChineseTokenizer and
* CJKTokenizer is that they have different
* token parsing logic.
* </p>
* <p>
* For example, if the Chinese text
* "C1C2C3C4" is to be indexed:
* <ul>
* <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4.
* <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.
* </ul>
* </p>
* <p>
* Therefore the index created by CJKTokenizer is much larger.
* </p>
* <p>
* The problem is that when searching for C1, C1C2, C1C3,
* C4C2, C1C2C3 ... the ChineseTokenizer works, but the
* CJKTokenizer will not work.
* </p>
* @deprecated (3.1) Use {@link StandardTokenizer} instead, which has the same functionality.
* This tokenizer will be removed in Lucene 5.0
*/
@Deprecated
public final class ChineseTokenizer extends Tokenizer {
public ChineseTokenizer(Reader in) {
super(in);
}
public ChineseTokenizer(AttributeSource source, Reader in) {
super(source, in);
}
public ChineseTokenizer(AttributeFactory factory, Reader in) {
super(factory, in);
}
private int offset = 0, bufferIndex=0, dataLen=0;
private final static int MAX_WORD_LEN = 255;
private final static int IO_BUFFER_SIZE = 1024;
private final char[] buffer = new char[MAX_WORD_LEN];
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
private int length;
private int start;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final void push(char c) {
if (length == 0) start = offset-1; // start of token
buffer[length++] = Character.toLowerCase(c); // buffer it
}
private final boolean flush() {
if (length>0) {
//System.out.println(new String(buffer, 0,
//length));
termAtt.copyBuffer(buffer, 0, length);
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
return true;
}
else
return false;
}
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
length = 0;
start = offset;
while (true) {
final char c;
offset++;
if (bufferIndex >= dataLen) {
dataLen = input.read(ioBuffer);
bufferIndex = 0;
}
if (dataLen == -1) {
offset--;
return flush();
} else
c = ioBuffer[bufferIndex++];
switch(Character.getType(c)) {
case Character.DECIMAL_DIGIT_NUMBER:
case Character.LOWERCASE_LETTER:
case Character.UPPERCASE_LETTER:
push(c);
if (length == MAX_WORD_LEN) return flush();
break;
case Character.OTHER_LETTER:
if (length>0) {
bufferIndex--;
offset--;
return flush();
}
push(c);
return flush();
default:
if (length>0) return flush();
break;
}
}
}
@Override
public final void end() {
// set final offset
final int finalOffset = correctOffset(offset);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset() throws IOException {
super.reset();
offset = bufferIndex = dataLen = 0;
}
}

37
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cn/ChineseTokenizerFactory.java

@ -0,0 +1,37 @@
package com.fr.third.org.apache.lucene.analysis.cn;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import com.fr.third.org.apache.lucene.analysis.cn.ChineseTokenizer;
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizerFactory; // javadocs
import com.fr.third.org.apache.lucene.analysis.util.TokenizerFactory;
/**
* Factory for {@link ChineseTokenizer}
* @deprecated Use {@link StandardTokenizerFactory} instead.
*/
@Deprecated
public class ChineseTokenizerFactory extends TokenizerFactory {
public ChineseTokenizer create(Reader in) {
return new ChineseTokenizer(in);
}
}

41
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/cn/package.html

@ -0,0 +1,41 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
Analyzer for Chinese, which indexes unigrams (individual chinese characters).
<p>
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
<ul>
<li>StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
<li>CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
<li>SmartChineseAnalyzer (in the analyzers/smartcn package): Index words (attempt to segment Chinese text into words) as tokens.
</ul>
Example phrase: "我是中国人"
<ol>
<li>StandardAnalyzer: 我-是-中-国-人</li>
<li>CJKAnalyzer: 我是-是中-中国-国人</li>
<li>SmartChineseAnalyzer: 我-是-中国-人</li>
</ol>
</p>
</body>
</html>

176
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/commongrams/CommonGramsFilter.java

@ -0,0 +1,176 @@
/*
* Licensed under the Apache License,
* Version 2.0 (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package com.fr.third.org.apache.lucene.analysis.commongrams;
import java.io.IOException;
import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;
/*
* TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
*/
/**
* Construct bigrams for frequently occurring terms while indexing. Single terms
* are still indexed too, with bigrams overlaid. This is achieved through the
* use of {@link PositionIncrementAttribute#setPositionIncrement(int)}. Bigrams have a type
* of {@link #GRAM_TYPE}. Example:
* <ul>
* <li>input: "the quick brown fox"</li>
* <li>output: |"the","the_quick"|"quick"|"brown"|"fox"|</li>
* <li>"the_quick" has a position increment of 0, so it is in the same position
* as "the"; "the_quick" has a term.type() of "gram"</li>
* </ul>
*/
/*
* Constructors and makeCommonSet based on similar code in StopFilter
*/
public final class CommonGramsFilter extends TokenFilter {
public static final String GRAM_TYPE = "gram";
private static final char SEPARATOR = '_';
private final CharArraySet commonWords;
private final StringBuilder buffer = new StringBuilder();
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
private int lastStartOffset;
private boolean lastWasCommon;
private State savedState;
/**
* Construct a token stream filtering the given input using a Set of common
* words to create bigrams. Outputs both unigrams with position increment and
* bigrams with position increment 0 and type "gram" where one or both of the
* words in a potential bigram are in the set of common words.
*
* @param matchVersion the Lucene compatibility version
* @param input TokenStream input in filter chain
* @param commonWords The set of common words.
*/
public CommonGramsFilter(Version matchVersion, TokenStream input, CharArraySet commonWords) {
super(input);
this.commonWords = commonWords;
}
/**
* Inserts bigrams for common words into a token stream. For each input token,
* output the token. If the token and/or the following token are in the list
* of common words also output a bigram with position increment 0 and
* type="gram"
*
* TODO:Consider adding an option to not emit unigram stopwords
* as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
* changed to work with this.
*
* TODO: Consider optimizing for the case of three
* commongrams i.e "man of the year" normally produces 3 bigrams: "man-of",
* "of-the", "the-year" but with proper management of positions we could
* eliminate the middle bigram "of-the"and save a disk seek and a whole set of
* position lookups.
*/
@Override
public boolean incrementToken() throws IOException {
// get the next piece of input
if (savedState != null) {
restoreState(savedState);
savedState = null;
saveTermBuffer();
return true;
} else if (!input.incrementToken()) {
return false;
}
/* We build n-grams before and after stopwords.
* When valid, the buffer always contains at least the separator.
* If it's empty, there is nothing before this stopword.
*/
if (lastWasCommon || (isCommon() && buffer.length() > 0)) {
savedState = captureState();
gramToken();
return true;
}
saveTermBuffer();
return true;
}
/**
* {@inheritDoc}
*/
@Override
public void reset() throws IOException {
super.reset();
lastWasCommon = false;
savedState = null;
buffer.setLength(0);
}
// ================================================= Helper Methods ================================================
/**
* Determines if the current token is a common term
*
* @return {@code true} if the current token is a common term, {@code false} otherwise
*/
private boolean isCommon() {
return commonWords != null && commonWords.contains(termAttribute.buffer(), 0, termAttribute.length());
}
/**
* Saves this information to form the left part of a gram
*/
private void saveTermBuffer() {
buffer.setLength(0);
buffer.append(termAttribute.buffer(), 0, termAttribute.length());
buffer.append(SEPARATOR);
lastStartOffset = offsetAttribute.startOffset();
lastWasCommon = isCommon();
}
/**
* Constructs a compound token.
*/
private void gramToken() {
buffer.append(termAttribute.buffer(), 0, termAttribute.length());
int endOffset = offsetAttribute.endOffset();
clearAttributes();
int length = buffer.length();
char termText[] = termAttribute.buffer();
if (length > termText.length) {
termText = termAttribute.resizeBuffer(length);
}
buffer.getChars(0, length, termText, 0);
termAttribute.setLength(length);
posIncAttribute.setPositionIncrement(0);
posLenAttribute.setPositionLength(2); // bigram
offsetAttribute.setOffset(lastStartOffset, endOffset);
typeAttribute.setType(GRAM_TYPE);
buffer.setLength(0);
}
}
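
Editor's sketch (not part of this commit): a minimal fragment, assuming the
WhitespaceTokenizer from the repackaged core package. Since SEPARATOR is '_',
the common-word bigram for "the quick" is the term "the_quick", emitted at
the same position as "the" with type "gram".

// Fragment only: with common words {"the"}, "the quick brown fox" yields
// |the, the_quick|quick|brown|fox| where "the_quick" has position increment 0.
CharArraySet common = new CharArraySet(Version.LUCENE_40,
    java.util.Arrays.asList("the"), true);
Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_40,
    new StringReader("the quick brown fox"));
TokenStream ts = new CommonGramsFilter(Version.LUCENE_40, tok, common);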

79
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java

@ -0,0 +1,79 @@
package com.fr.third.org.apache.lucene.analysis.commongrams;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import com.fr.third.org.apache.lucene.analysis.core.StopAnalyzer;
import com.fr.third.org.apache.lucene.analysis.util.*;
/**
* Constructs a {@link CommonGramsFilter}.
* <pre class="prettyprint" >
* &lt;fieldType name="text_cmmngrms" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.CommonGramsFilterFactory" words="commongramsstopwords.txt" ignoreCase="false"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
*/
/*
* This is pretty close to a straight copy from StopFilterFactory
*/
public class CommonGramsFilterFactory extends TokenFilterFactory implements
ResourceLoaderAware {
public void inform(ResourceLoader loader) throws IOException {
String commonWordFiles = args.get("words");
ignoreCase = getBoolean("ignoreCase", false);
if (commonWordFiles != null) {
if ("snowball".equalsIgnoreCase(args.get("format"))) {
commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
} else {
commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
}
} else {
commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
}
//Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095
private CharArraySet commonWords;
private boolean ignoreCase;
public boolean isIgnoreCase() {
return ignoreCase;
}
public CharArraySet getCommonWords() {
return commonWords;
}
@Override
public CommonGramsFilter create(TokenStream input) {
CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords);
return commonGrams;
}
}
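
A hedged sketch of the factory lifecycle implied above, assuming the Lucene 4.0-era protocol of init(Map) followed by inform(ResourceLoader), plus the stock ClasspathResourceLoader; the CommonGramsFactoryDemo class is hypothetical. With no "words" arg, inform() falls back to StopAnalyzer.ENGLISH_STOP_WORDS_SET, so "in" and "the" below are treated as common.

import java.io.StringReader;
import java.util.HashMap;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory;
import com.fr.third.org.apache.lucene.analysis.core.WhitespaceTokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.util.ClasspathResourceLoader;
import com.fr.third.org.apache.lucene.util.Version;

public class CommonGramsFactoryDemo {
    public static void main(String[] args) throws Exception {
        CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
        factory.setLuceneMatchVersion(Version.LUCENE_40);
        factory.init(new HashMap<String, String>()); // no "words" arg
        factory.inform(new ClasspathResourceLoader(CommonGramsFactoryDemo.class));
        TokenStream ts = factory.create(new WhitespaceTokenizer(
                Version.LUCENE_40, new StringReader("coffee in the morning")));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // coffee, coffee_in, in, in_the, the, the_morning, morning
            System.out.println(term);
        }
        ts.end();
        ts.close();
    }
}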

126
fine-lucene/src/com/fr/third/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilter.java

@@ -0,0 +1,126 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fr.third.org.apache.lucene.analysis.commongrams;
import java.io.IOException;
import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import static com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsFilter.GRAM_TYPE;
/**
* Wraps a CommonGramsFilter, optimizing phrase queries by returning single
* words only when they are not members of a bigram.
*
* Example:
* <ul>
* <li>query input to CommonGramsFilter: "the rain in spain falls mainly"
* <li>output of CommomGramsFilter/input to CommonGramsQueryFilter:
* |"the, "the-rain"|"rain" "rain-in"|"in, "in-spain"|"spain"|"falls"|"mainly"
* <li>output of CommonGramsQueryFilter:"the-rain", "rain-in" ,"in-spain",
* "falls", "mainly"
* </ul>
*/
/*
* See: http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/TokenStream.html and
* http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/analysis/package.html?revision=718798
*/
public final class CommonGramsQueryFilter extends TokenFilter {
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private State previous;
private String previousType;
private boolean exhausted;
/**
* Constructs a new CommonGramsQueryFilter based on the provided CommonGramsFilter
*
* @param input CommonGramsFilter the QueryFilter will use
*/
public CommonGramsQueryFilter(CommonGramsFilter input) {
super(input);
}
/**
* {@inheritDoc}
*/
@Override
public void reset() throws IOException {
super.reset();
previous = null;
previousType = null;
exhausted = false;
}
/**
* Output bigrams whenever possible to optimize queries. Only output unigrams
* when they are not a member of a bigram. Example:
* <ul>
* <li>input: "the rain in spain falls mainly"
* <li>output:"the-rain", "rain-in" ,"in-spain", "falls", "mainly"
* </ul>
*/
@Override
public boolean incrementToken() throws IOException {
while (!exhausted && input.incrementToken()) {
State current = captureState();
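// Current token is a unigram and a token is pending: emit the pending
// token (a bigram or an uncovered unigram) and hold the current one.
// When a bigram arrives instead, it simply replaces the pending token
// below, suppressing the left-hand unigram it covers.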
if (previous != null && !isGramType()) {
restoreState(previous);
previous = current;
previousType = typeAttribute.type();
if (isGramType()) {
posIncAttribute.setPositionIncrement(1);
}
return true;
}
previous = current;
}
exhausted = true;
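// End of stream: flush the pending token, unless the most recently
// emitted token (tracked by previousType) was a bigram that already
// covers it.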
if (previous == null || GRAM_TYPE.equals(previousType)) {
return false;
}
restoreState(previous);
previous = null;
if (isGramType()) {
posIncAttribute.setPositionIncrement(1);
}
return true;
}
// ================================================= Helper Methods ================================================
/**
* Convenience method to check if the current type is a gram type
*
* @return {@code true} if the current type is a gram type, {@code false} otherwise
*/
public boolean isGramType() {
return GRAM_TYPE.equals(typeAttribute.type());
}
}
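
To see the hold-back logic end to end, a sketch under the same assumptions as the demos above (repackaged package names, a hypothetical common set holding just "in"). The trailing "spain" is covered by "in_spain" and is dropped once the stream is exhausted.

import java.io.StringReader;
import java.util.Arrays;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import com.fr.third.org.apache.lucene.analysis.core.WhitespaceTokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

public class CommonGramsQueryDemo {
    public static void main(String[] args) throws Exception {
        CharArraySet common = new CharArraySet(
                Version.LUCENE_40, Arrays.asList("in"), true);
        TokenStream ts = new CommonGramsQueryFilter(new CommonGramsFilter(
                Version.LUCENE_40,
                new WhitespaceTokenizer(Version.LUCENE_40,
                        new StringReader("rain in spain")),
                common));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term); // rain_in, in_spain
        }
        ts.end();
        ts.close();
    }
}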
