CL
7 years ago
1152 changed files with 284133 additions and 1 deletion
@@ -0,0 +1,19 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

com.fr.third.org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory
com.fr.third.org.apache.lucene.analysis.charfilter.MappingCharFilterFactory
com.fr.third.org.apache.lucene.analysis.fa.PersianCharFilterFactory
com.fr.third.org.apache.lucene.analysis.pattern.PatternReplaceCharFilterFactory
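The block above has the shape of a JDK service-provider registration file: one implementation class per line, with `#` lines as comments. The file names themselves are not shown in this view, but for Lucene 4.x these lists live under META-INF/services/, keyed by the factory base class. A minimal probe of such a registration — assuming the repackaged jar is on the classpath, that the base class is the repackaged analysis.util.CharFilterFactory, and that the factories keep public no-arg constructors (Lucene itself loads them through its own SPI loader rather than java.util.ServiceLoader):

```java
import java.util.ServiceLoader;

// Hypothetical probe; the service type is an assumption inferred from the
// package prefix in the list above (Lucene 4.x keeps CharFilterFactory in
// the analysis.util package).
import com.fr.third.org.apache.lucene.analysis.util.CharFilterFactory;

public class CharFilterSpiProbe {
    public static void main(String[] args) {
        // ServiceLoader reads the META-INF/services registration and
        // instantiates each listed factory via its no-arg constructor.
        for (CharFilterFactory factory : ServiceLoader.load(CharFilterFactory.class)) {
            System.out.println(factory.getClass().getName());
        }
    }
}
```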
@@ -0,0 +1,92 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

com.fr.third.org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.ar.ArabicStemFilterFactory
com.fr.third.org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.br.BrazilianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.cjk.CJKBigramFilterFactory
com.fr.third.org.apache.lucene.analysis.cjk.CJKWidthFilterFactory
com.fr.third.org.apache.lucene.analysis.cn.ChineseFilterFactory
com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory
com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory
com.fr.third.org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilterFactory
com.fr.third.org.apache.lucene.analysis.core.StopFilterFactory
com.fr.third.org.apache.lucene.analysis.core.TypeTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.cz.CzechStemFilterFactory
com.fr.third.org.apache.lucene.analysis.de.GermanLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.de.GermanMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.de.GermanNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.de.GermanStemFilterFactory
com.fr.third.org.apache.lucene.analysis.el.GreekLowerCaseFilterFactory
com.fr.third.org.apache.lucene.analysis.el.GreekStemFilterFactory
com.fr.third.org.apache.lucene.analysis.en.EnglishMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.en.EnglishPossessiveFilterFactory
com.fr.third.org.apache.lucene.analysis.en.KStemFilterFactory
com.fr.third.org.apache.lucene.analysis.en.PorterStemFilterFactory
com.fr.third.org.apache.lucene.analysis.es.SpanishLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.fr.FrenchMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.ga.IrishLowerCaseFilterFactory
com.fr.third.org.apache.lucene.analysis.gl.GalicianMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.gl.GalicianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.hi.HindiStemFilterFactory
com.fr.third.org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory
com.fr.third.org.apache.lucene.analysis.id.IndonesianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.in.IndicNormalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.it.ItalianLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.lv.LatvianStemFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.LengthFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.TrimFilterFactory
com.fr.third.org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory
com.fr.third.org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory
com.fr.third.org.apache.lucene.analysis.ngram.NGramFilterFactory
com.fr.third.org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
com.fr.third.org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.payloads.NumericPayloadTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory
com.fr.third.org.apache.lucene.analysis.position.PositionFilterFactory
com.fr.third.org.apache.lucene.analysis.pt.PortugueseLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.pt.PortugueseMinimalStemFilterFactory
com.fr.third.org.apache.lucene.analysis.pt.PortugueseStemFilterFactory
com.fr.third.org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
com.fr.third.org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.shingle.ShingleFilterFactory
com.fr.third.org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
com.fr.third.org.apache.lucene.analysis.standard.ClassicFilterFactory
com.fr.third.org.apache.lucene.analysis.standard.StandardFilterFactory
com.fr.third.org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
com.fr.third.org.apache.lucene.analysis.synonym.SynonymFilterFactory
com.fr.third.org.apache.lucene.analysis.th.ThaiWordFilterFactory
com.fr.third.org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory
com.fr.third.org.apache.lucene.analysis.util.ElisionFilterFactory
com.fr.third.org.apache.lucene.collation.CollationKeyFilterFactory
@@ -0,0 +1,31 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

com.fr.third.org.apache.lucene.analysis.ar.ArabicLetterTokenizerFactory
com.fr.third.org.apache.lucene.analysis.cjk.CJKTokenizerFactory
com.fr.third.org.apache.lucene.analysis.cn.ChineseTokenizerFactory
com.fr.third.org.apache.lucene.analysis.core.KeywordTokenizerFactory
com.fr.third.org.apache.lucene.analysis.core.LetterTokenizerFactory
com.fr.third.org.apache.lucene.analysis.core.LowerCaseTokenizerFactory
com.fr.third.org.apache.lucene.analysis.core.WhitespaceTokenizerFactory
com.fr.third.org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory
com.fr.third.org.apache.lucene.analysis.ngram.NGramTokenizerFactory
com.fr.third.org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory
com.fr.third.org.apache.lucene.analysis.pattern.PatternTokenizerFactory
com.fr.third.org.apache.lucene.analysis.ru.RussianLetterTokenizerFactory
com.fr.third.org.apache.lucene.analysis.standard.ClassicTokenizerFactory
com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizerFactory
com.fr.third.org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory
com.fr.third.org.apache.lucene.analysis.wikipedia.WikipediaTokenizerFactory
@@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

com.fr.third.org.apache.lucene.codecs.lucene40.Lucene40Codec
com.fr.third.org.apache.lucene.codecs.lucene3x.Lucene3xCodec
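In Lucene 4.x, codecs are resolved at read time by their short SPI name rather than by class name, and this registration is what makes that lookup work. A minimal sketch, assuming the repackaged classes mirror Lucene 4.x's Codec API (the package is inferred from the entries above):

```java
// Minimal sketch; Codec.forName resolves a codec by its SPI name by walking
// the registrations in this services file.
import com.fr.third.org.apache.lucene.codecs.Codec;

public class CodecProbe {
    public static void main(String[] args) {
        // "Lucene40" is the codec's SPI name, not its class name.
        Codec codec = Codec.forName("Lucene40");
        System.out.println(codec.getName()); // prints: Lucene40
    }
}
```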
@@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

com.fr.third.org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat
@@ -0,0 +1,125 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
# Cleaned on October 11, 2009 (not normalized, so use before normalization)
# This means that when modifying this list, you might need to add some
# redundant entries, for example containing forms with both أ and ا
من
ومن
منها
منه
في
وفي
فيها
فيه
و
ف
ثم
او
أو
ب
بها
به
ا
أ
اى
اي
أي
أى
لا
ولا
الا
ألا
إلا
لكن
ما
وما
كما
فما
عن
مع
اذا
إذا
ان
أن
إن
انها
أنها
إنها
انه
أنه
إنه
بان
بأن
فان
فأن
وان
وأن
وإن
التى
التي
الذى
الذي
الذين
الى
الي
إلى
إلي
على
عليها
عليه
اما
أما
إما
ايضا
أيضا
كل
وكل
لم
ولم
لن
ولن
هى
هي
هو
وهى
وهي
وهو
فهى
فهي
فهو
انت
أنت
لك
لها
له
هذه
هذا
تلك
ذلك
هناك
كانت
كان
يكون
تكون
وكانت
وكان
غير
بعض
قد
نحو
بين
بينما
منذ
ضمن
حيث
الان
الآن
خلال
بعد
قبل
حتى
عند
عندما
لدى
جميع
@@ -0,0 +1,193 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
а
аз
ако
ала
бе
без
беше
би
бил
била
били
било
близо
бъдат
бъде
бяха
в
вас
ваш
ваша
вероятно
вече
взема
ви
вие
винаги
все
всеки
всички
всичко
всяка
във
въпреки
върху
г
ги
главно
го
д
да
дали
до
докато
докога
дори
досега
доста
е
едва
един
ето
за
зад
заедно
заради
засега
затова
защо
защото
и
из
или
им
има
имат
иска
й
каза
как
каква
какво
както
какъв
като
кога
когато
което
които
кой
който
колко
която
къде
където
към
ли
м
ме
между
мен
ми
мнозина
мога
могат
може
моля
момента
му
н
на
над
назад
най
направи
напред
например
нас
не
него
нея
ни
ние
никой
нито
но
някои
някой
няма
обаче
около
освен
особено
от
отгоре
отново
още
пак
по
повече
повечето
под
поне
поради
после
почти
прави
пред
преди
през
при
пък
първо
с
са
само
се
сега
си
скоро
след
сме
според
сред
срещу
сте
съм
със
също
т
тази
така
такива
такъв
там
твой
те
тези
ти
тн
то
това
тогава
този
той
толкова
точно
трябва
тук
тъй
тя
тях
у
харесва
ч
че
често
чрез
ще
щом
я
@@ -0,0 +1,128 @@
a
ainda
alem
ambas
ambos
antes
ao
aonde
aos
apos
aquele
aqueles
as
assim
com
como
contra
contudo
cuja
cujas
cujo
cujos
da
das
de
dela
dele
deles
demais
depois
desde
desta
deste
dispoe
dispoem
diversa
diversas
diversos
do
dos
durante
e
ela
elas
ele
eles
em
entao
entre
essa
essas
esse
esses
esta
estas
este
estes
ha
isso
isto
logo
mais
mas
mediante
menos
mesma
mesmas
mesmo
mesmos
na
nas
nao
nas
nem
nesse
neste
nos
o
os
ou
outra
outras
outro
outros
pelas
pelas
pelo
pelos
perante
pois
por
porque
portanto
proprio
propios
quais
qual
qualquer
quando
quanto
que
quem
quer
se
seja
sem
sendo
seu
seus
sob
sobre
sua
suas
tal
tambem
teu
teus
toda
todas
todo
todos
tua
tuas
tudo
um
uma
umas
uns
@@ -0,0 +1,220 @@
# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed)
a
abans
ací
ah
així
això
al
als
aleshores
algun
alguna
algunes
alguns
alhora
allà
allí
allò
altra
altre
altres
amb
ambdós
ambdues
apa
aquell
aquella
aquelles
aquells
aquest
aquesta
aquestes
aquests
aquí
baix
cada
cadascú
cadascuna
cadascunes
cadascuns
com
contra
d'un
d'una
d'unes
d'uns
dalt
de
del
dels
des
després
dins
dintre
donat
doncs
durant
e
eh
el
els
em
en
encara
ens
entre
érem
eren
éreu
es
és
esta
està
estàvem
estaven
estàveu
esteu
et
etc
ets
fins
fora
gairebé
ha
han
has
havia
he
hem
heu
hi
ho
i
igual
iguals
ja
l'hi
la
les
li
li'n
llavors
m'he
ma
mal
malgrat
mateix
mateixa
mateixes
mateixos
me
mentre
més
meu
meus
meva
meves
molt
molta
moltes
molts
mon
mons
n'he
n'hi
ne
ni
no
nogensmenys
només
nosaltres
nostra
nostre
nostres
o
oh
oi
on
pas
pel
pels
per
però
perquè
poc
poca
pocs
poques
potser
propi
qual
quals
quan
quant
que
què
quelcom
qui
quin
quina
quines
quins
s'ha
s'han
sa
semblant
semblants
ses
seu
seus
seva
seva
seves
si
sobre
sobretot
sóc
solament
sols
son
són
sons
sota
sou
t'ha
t'han
t'he
ta
tal
també
tampoc
tan
tant
tanta
tantes
teu
teus
teva
teves
ton
tons
tot
tota
totes
tots
un
una
unes
uns
us
va
vaig
vam
van
vas
veu
vosaltres
vostra
vostre
vostres
@@ -0,0 +1,35 @@
a
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
s
such
t
that
the
their
then
there
these
they
this
to
was
will
with
www
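The resource files above all follow one convention: one stopword per line, with optional `#` comment lines. A plain-JDK sketch of reading such a list (the file name below is illustrative; the actual resource paths are not shown in this view):

```java
import java.io.BufferedReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.Set;

public class StopwordList {
    public static void main(String[] args) throws Exception {
        Set<String> stopwords = new HashSet<>();
        // Hypothetical path; each resource holds one token per line,
        // with '#' marking comment lines.
        try (BufferedReader r = Files.newBufferedReader(
                Path.of("stopwords_en.txt"), StandardCharsets.UTF_8)) {
            String line;
            while ((line = r.readLine()) != null) {
                line = line.trim();
                if (!line.isEmpty() && !line.startsWith("#")) {
                    stopwords.add(line);
                }
            }
        }
        System.out.println(stopwords.contains("the")); // true for the English list
    }
}
```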
@@ -0,0 +1,67 @@
<?xml version="1.0" encoding="US-ASCII"?>
<!--
  Copyright 1999-2004 The Apache Software Foundation

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->

<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
                            classes, exceptions?, patterns)>

<!-- Hyphen character to be used in the exception list as a shortcut for
     <hyphen pre-break="-"/>. Defaults to '-'.
-->
<!ELEMENT hyphen-char EMPTY>
<!ATTLIST hyphen-char value CDATA #REQUIRED>

<!-- Default minimum length in characters of hyphenated word fragments
     before and after the line break. For some languages this is not
     only for aesthetic purposes; wrong hyphens may be generated if this
     is not accounted for.
-->
<!ELEMENT hyphen-min EMPTY>
<!ATTLIST hyphen-min before CDATA #REQUIRED>
<!ATTLIST hyphen-min after CDATA #REQUIRED>

<!-- Character equivalent classes: space separated list of character groups, all
     characters in a group are to be treated equivalent as far as
     the hyphenation algorithm is concerned. The first character in a group
     is the group's equivalent character. Patterns should only contain
     first characters. It also defines word characters, i.e. a word that
     contains characters not present in any of the classes is not hyphenated.
-->
<!ELEMENT classes (#PCDATA)>

<!-- Hyphenation exceptions: space separated list of hyphenated words.
     A hyphen is indicated by the hyphen tag, but you can use the
     hyphen-char defined previously as a shortcut. This is for cases
     where the algorithm finds wrong hyphens or you want
     to provide your own hyphenation for some words.
-->
<!ELEMENT exceptions (#PCDATA|hyphen)* >

<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
     characters as described before; between any two word characters a digit
     in the range 0 to 9 may be specified. The absence of a digit is equivalent
     to zero. The '.' character is reserved to indicate the beginning or ending
     of words. -->
<!ELEMENT patterns (#PCDATA)>

<!-- A "full hyphen" equivalent to TeX's \discretionary
     with pre-break, post-break and no-break attributes.
     To be used in the exceptions list; the hyphen character is not
     automatically added -->
<!ELEMENT hyphen EMPTY>
<!ATTLIST hyphen pre CDATA #IMPLIED>
<!ATTLIST hyphen no CDATA #IMPLIED>
<!ATTLIST hyphen post CDATA #IMPLIED>
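For reference, a minimal document conforming to this DTD; the classes, exception, and patterns below are invented placeholders, not a real pattern set:

```xml
<?xml version="1.0" encoding="US-ASCII"?>
<hyphenation-info>
  <hyphen-char value="-"/>
  <hyphen-min before="2" after="2"/>
  <!-- first character of each group is the group's equivalent form -->
  <classes>
    aA bB cC
  </classes>
  <!-- 'ta-ble' uses the hyphen-char shortcut instead of an explicit <hyphen/> -->
  <exceptions>
    ta-ble
  </exceptions>
  <!-- digits grade break points; '.' anchors a word boundary -->
  <patterns>
    .ab4 a1bc 2b1c
  </patterns>
</hyphenation-info>
```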
@@ -0,0 +1,172 @@
a
s
k
o
i
u
v
z
dnes
cz
tímto
budeš
budem
byli
jseš
můj
svým
ta
tomto
tohle
tuto
tyto
jej
zda
proč
máte
tato
kam
tohoto
kdo
kteří
mi
nám
tom
tomuto
mít
nic
proto
kterou
byla
toho
protože
asi
ho
naši
napište
re
což
tím
takže
svých
její
svými
jste
aj
tu
tedy
teto
bylo
kde
ke
pravé
ji
nad
nejsou
či
pod
téma
mezi
přes
ty
pak
vám
ani
když
však
neg
jsem
tento
článku
články
aby
jsme
před
pta
jejich
byl
ještě
až
bez
také
pouze
první
vaše
která
nás
nový
tipy
pokud
může
strana
jeho
své
jiné
zprávy
nové
není
vás
jen
podle
zde
už
být
více
bude
již
než
který
by
které
co
nebo
ten
tak
má
při
od
po
jsou
jak
další
ale
si
se
ve
to
jako
za
zpět
ze
do
pro
je
na
atd
atp
jakmile
přičemž
já
on
ona
ono
oni
ony
my
vy
jí
ji
mě
mne
jemu
tomu
těm
těmu
němu
němuž
jehož
jíž
jelikož
jež
jakož
načež
@@ -0,0 +1,78 @@
# Lucene Greek Stopwords list
# Note: by default this file is used after GreekLowerCaseFilter,
# so when modifying this file use 'σ' instead of 'ς'
ο
η
το
οι
τα
του
τησ
των
τον
την
και
κι
κ
ειμαι
εισαι
ειναι
ειμαστε
ειστε
στο
στον
στη
στην
μα
αλλα
απο
για
προσ
με
σε
ωσ
παρα
αντι
κατα
μετα
θα
να
δε
δεν
μη
μην
επι
ενω
εαν
αν
τοτε
που
πωσ
ποιοσ
ποια
ποιο
ποιοι
ποιεσ
ποιων
ποιουσ
αυτοσ
αυτη
αυτο
αυτοι
αυτων
αυτουσ
αυτεσ
αυτα
εκεινοσ
εκεινη
εκεινο
εκεινοι
εκεινεσ
εκεινα
εκεινων
εκεινουσ
οπωσ
ομωσ
ισωσ
οσο
οτι
@@ -0,0 +1,99 @@
# example set of basque stopwords
al
anitz
arabera
asko
baina
bat
batean
batek
bati
batzuei
batzuek
batzuetan
batzuk
bera
beraiek
berau
berauek
bere
berori
beroriek
beste
bezala
da
dago
dira
ditu
du
dute
edo
egin
ere
eta
eurak
ez
gainera
gu
gutxi
guzti
haiei
haiek
haietan
hainbeste
hala
han
handik
hango
hara
hari
hark
hartan
hau
hauei
hauek
hauetan
hemen
hemendik
hemengo
hi
hona
honek
honela
honetan
honi
hor
hori
horiei
horiek
horietan
horko
horra
horrek
horrela
horretan
horri
hortik
hura
izan
ni
noiz
nola
non
nondik
nongo
nor
nora
ze
zein
zen
zenbait
zenbat
zer
zergatik
ziren
zituen
zu
zuek
zuen
zuten
@@ -0,0 +1,313 @@
# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
# Note: by default this file is used after normalization, so when adding entries
# to this file, use the arabic 'ي' instead of 'ی'
انان
نداشته
سراسر
خياه
ايشان
وي
تاكنون
بيشتري
دوم
پس
ناشي
وگو
يا
داشتند
سپس
هنگام
هرگز
پنج
نشان
امسال
ديگر
گروهي
شدند
چطور
ده
و
دو
نخستين
ولي
چرا
چه
وسط
ه
كدام
قابل
يك
رفت
هفت
همچنين
در
هزار
بله
بلي
شايد
اما
شناسي
گرفته
دهد
داشته
دانست
داشتن
خواهيم
ميليارد
وقتيكه
امد
خواهد
جز
اورده
شده
بلكه
خدمات
شدن
برخي
نبود
بسياري
جلوگيري
حق
كردند
نوعي
بعري
نكرده
نظير
نبايد
بوده
بودن
داد
اورد
هست
جايي
شود
دنبال
داده
بايد
سابق
هيچ
همان
انجا
كمتر
كجاست
گردد
كسي
تر
مردم
تان
دادن
بودند
سري
جدا
ندارند
مگر
يكديگر
دارد
دهند
بنابراين
هنگامي
سمت
جا
انچه
خود
دادند
زياد
دارند
اثر
بدون
بهترين
بيشتر
البته
به
براساس
بيرون
كرد
بعضي
گرفت
توي
اي
ميليون
او
جريان
تول
بر
مانند
برابر
باشيم
مدتي
گويند
اكنون
تا
تنها
جديد
چند
بي
نشده
كردن
كردم
گويد
كرده
كنيم
نمي
نزد
روي
قصد
فقط
بالاي
ديگران
اين
ديروز
توسط
سوم
ايم
دانند
سوي
استفاده
شما
كنار
داريم
ساخته
طور
امده
رفته
نخست
بيست
نزديك
طي
كنيد
از
انها
تمامي
داشت
يكي
طريق
اش
چيست
روب
نمايد
گفت
چندين
چيزي
تواند
ام
ايا
با
ان
ايد
ترين
اينكه
ديگري
راه
هايي
بروز
همچنان
پاعين
كس
حدود
مختلف
مقابل
چيز
گيرد
ندارد
ضد
همچون
سازي
شان
مورد
باره
مرسي
خويش
برخوردار
چون
خارج
شش
هنوز
تحت
ضمن
هستيم
گفته
فكر
بسيار
پيش
براي
روزهاي
انكه
نخواهد
بالا
كل
وقتي
كي
چنين
كه
گيري
نيست
است
كجا
كند
نيز
يابد
بندي
حتي
توانند
عقب
خواست
كنند
بين
تمام
همه
ما
باشند
مثل
شد
اري
باشد
اره
طبق
بعد
اگر
صورت
غير
جاي
بيش
ريزي
اند
زيرا
چگونه
بار
لطفا
مي
درباره
من
ديده
همين
گذاري
برداري
علت
گذاشته
هم
فوق
نه
ها
شوند
اباد
همواره
هر
اول
خواهند
چهار
نام
امروز
مان
هاي
قبل
كنم
سعي
تازه
را
هستند
زير
جلوي
عنوان
بود
@@ -0,0 +1,110 @@

a
ach
ag
agus
an
aon
ar
arna
as
b'
ba
beirt
bhúr
caoga
ceathair
ceathrar
chomh
chtó
chuig
chun
cois
céad
cúig
cúigear
d'
daichead
dar
de
deich
deichniúr
den
dhá
do
don
dtí
dá
dár
dó
faoi
faoin
faoina
faoinár
fara
fiche
gach
gan
go
gur
haon
hocht
i
iad
idir
in
ina
ins
inár
is
le
leis
lena
lenár
m'
mar
mo
mé
na
nach
naoi
naonúr
ná
ní
níor
nó
nócha
ocht
ochtar
os
roimh
sa
seacht
seachtar
seachtó
seasca
seisear
siad
sibh
sinn
sna
sé
sí
tar
thar
thú
triúr
trí
trína
trínár
tríocha
tú
um
ár
é
éis
í
ó
ón
óna
ónár
@@ -0,0 +1,647 @@
# Steps file for the RSLP stemmer.

# Step 1: Plural Reduction
{ "Plural", 3, 1, {"s"},
# bons -> bon
{"ns",1,"n",{"luns","furatapóns","furatapons"}},
# xamós -> xamón
{"ós",3,"ón"},
# balões -> balón
{"ões",3,"ón"},
# capitães -> capitão
{"ães",1,"ão",{"mães","magalhães"}},
# normais -> normal
{"ais",2,"al",{"cais","tais","mais","pais","ademais"}},
{"áis",2,"al",{"cáis","táis", "máis", "páis", "ademáis"}},
# papéis -> papel
{"éis",2,"el"},
# posíbeis -> posíbel
{"eis",2,"el"},
# espanhóis -> espanhol
{"óis",2,"ol",{"escornabóis"}},
# caracois -> caracol
{"ois",2,"ol",{"escornabois"}},
# cadrís -> cadril
{"ís",2,"il",{"país"}},
# cadris -> cadril
{"is",2,"il",{"menfis","pais","kinguis"}},
# males -> mal
{"les",2,"l",{"ingles","marselles","montreales","senegales","manizales","móstoles","nápoles"}},
# mares -> mar
{"res",3,"r",{"petres","henares","cáceres","baleares","linares","londres","mieres","miraflores","mércores","venres", "pires"}},
# luces -> luz
{"ces",2,"z"},
# luzes -> luz
{"zes",2,"z"},
# leises -> lei
{"ises",3,"z"},
# animás -> animal
{"ás",1,"al",{"más"}},
# gases -> gas
{"ses",2,"s"},
# casas -> casa
{"s",2,"",{"barbadés","barcelonés","cantonés","gabonés","llanés","medinés","escocés","escocês","francês","barcelonês","cantonês","macramés","reves","barcelones","cantones","gabones","llanes","magallanes","medines","escoces","frances","xoves","martes","aliás","pires","lápis","cais","mais","mas","menos","férias","pêsames","crúcis","país","cangas","atenas","asturias","canarias","filipinas","honduras","molucas","caldas","mascareñas","micenas","covarrubias","psoas","óculos","nupcias","xoves","martes","llanes"}}};
||||
|
||||
{ "Unification", 0, 0, {}, |
||||
# cansadísimo -> cansadísimo |
||||
{"íssimo",5,"ísimo"}, |
||||
# cansadísima -> cansadísima |
||||
{"íssima",5,"ísima"}, |
||||
# homaço -> homazo |
||||
{"aço",4,"azo"}, |
||||
# mulheraça -> mulheraza |
||||
{"aça",4,"aza"}, |
||||
# xentuça -> xentuza |
||||
{"uça",4,"uza"}, |
||||
# manilhar -> manillar |
||||
{"lhar",2,"llar"}, |
||||
# colher -> coller |
||||
{"lher",2,"ller"}, |
||||
# melhor -> mellor |
||||
{"lhor",2,"llor"}, |
||||
# alho -> allo |
||||
{"lho",1,"llo"}, |
||||
# linhar -> liñar |
||||
{"nhar",2,"ñar"}, |
||||
# penhor -> peñor |
||||
{"nhor",2,"ñor"}, |
||||
# anho -> año |
||||
{"nho",1,"ño"}, |
||||
# cunha -> cuña |
||||
{"nha",1,"ña"}, |
||||
# hospitalário -> hospitalario |
||||
{"ário",3,"ario"}, |
||||
# bibliotecária -> bibliotecaria |
||||
{"ária",3,"aria"}, |
||||
# agradable -> agradábel |
||||
{"able",2,"ábel"}, |
||||
# agradávele -> agradábel |
||||
{"ável",2,"ábel"}, |
||||
# imposible -> imposíbel |
||||
{"ible",2,"íbel"}, |
||||
# imposível -> imposíbel |
||||
{"ível",2,"íbel"}, |
||||
# imposiçom -> imposición |
||||
{"çom",2,"ción"}, |
||||
# garagem -> garaxe |
||||
{"agem",2,"axe"}, |
||||
# garage -> garaxe |
||||
{"age",2,"axe"}, |
||||
# impressão -> impressón |
||||
{"ão",3,"ón"}, |
||||
# irmao -> irmán |
||||
{"ao",1,"án"}, |
||||
# irmau -> irmán |
||||
{"au",1,"án"}, |
||||
# garrafom -> garrafón |
||||
{"om",3,"ón"}, |
||||
# cantem -> canten |
||||
{"m",2,"n"}}; |
||||
|
||||
{ "Adverb", 0, 0, {}, |
||||
# felizmente -> feliz |
||||
{"mente",4,"",{"experimente","vehemente","sedimente"}}}; |
||||
|
||||
{ "Augmentative", 0, 1, {}, |
||||
# cansadísimo -> cansad |
||||
{"dísimo",5}, |
||||
# cansadísima -> cansad |
||||
{"dísima",5}, |
||||
# amabilísimo -> ama |
||||
{"bilísimo",3}, |
||||
# amabilísima -> ama |
||||
{"bilísima",3}, |
||||
# fortísimo -> fort |
||||
{"ísimo",3}, |
||||
# fortísima -> fort |
||||
{"ísima",3}, |
||||
# centésimo -> cent |
||||
{"ésimo",3}, |
||||
# centésima -> cent |
||||
{"ésima",3}, |
||||
# paupérrimo -> paup |
||||
{"érrimo",4}, |
||||
# paupérrima -> paup |
||||
{"érrima",4}, |
||||
# charlatana -> charlat |
||||
{"ana",2,"",{"argana","banana","choupana","espadana","faciana","iguana","lantana","macana","membrana","mesana","nirvana","obsidiana","palangana","pavana","persiana","pestana","porcelana","pseudomembrana","roldana","sábana","salangana","saragana","ventana"}}, |
||||
# charlatán -> charlat |
||||
{"án",3,"",{"ademán","bardán","barregán","corricán","curricán","faisán","furacán","fustán","gabán","gabián","galán","gañán","lavacán","mazán","mourán","rabadán","serán","serrán","tabán","titán","tobogán","verán","volcán","volován"}}, |
||||
# homazo -> hom |
||||
{"azo",4,"",{"abrazo","espazo","andazo","bagazo","balazo","bandazo","cachazo","carazo","denazo","engazo","famazo","lampreazo","pantocazo","pedazo","preñazo","regazo","ribazo","sobrazo","terrazo","trompazo"}}, |
||||
# mulleraza -> muller |
||||
{"aza",3,"",{"alcarraza","ameaza","baraza","broucaza","burgaza","cabaza","cachaza","calaza","carpaza","carraza","coiraza","colmaza","fogaza","famaza","labaza","liñaza","melaza","mordaza","paraza","pinaza","rabaza","rapaza","trancaza"}}, |
||||
# cascallo -> casc |
||||
{"allo",4,"",{"traballo"}}, |
||||
# xentalla -> xent |
||||
{"alla",4}, |
||||
# bocarra -> boc |
||||
{"arra",3,"",{"cigarra","cinzarra"}}, |
||||
# medicastro -> medic |
||||
{"astro",3,"",{"balastro","bimbastro","canastro","retropilastro"}}, |
||||
# poetastra -> poet |
||||
{"astra",3,"",{"banastra","canastra","contrapilastra","piastra","pilastra"}}, |
||||
# corpázio -> corp |
||||
{"ázio",3,"",{"topázio"}}, |
||||
# soutelo -> sout |
||||
{"elo",4,"",{"bacelo","barrelo","bicarelo","biquelo","boquelo","botelo","bouquelo","cacarelo","cachelo","cadrelo","campelo","candelo","cantelo","carabelo","carambelo","caramelo","cercelo","cerebelo","chocarelo","coitelo","conchelo","corbelo","cotobelo","couselo","destelo","desvelo","esfácelo","fandelo","fardelo","farelo","farnelo","flabelo","ganchelo","garfelo","involucelo","mantelo","montelo","outerelo","padicelo","pesadelo","pinguelo","piquelo","rampelo","rastrelo","restelo","tornecelo","trabelo","restrelo","portelo","ourelo","zarapelo"}}, |
||||
# avioneta -> avion |
||||
{"eta",3,"",{"arqueta","atleta","avoceta","baioneta","baldeta","banqueta","barraganeta","barreta","borleta","buceta","caceta","calceta","caldeta","cambeta","canaleta","caneta","carreta","cerceta","chaparreta","chapeta","chareta","chincheta","colcheta","cometa","corbeta","corveta","cuneta","desteta","espeta","espoleta","estafeta","esteta","faceta","falanxeta","frasqueta","gaceta","gabeta","galleta","garabeta","gaveta","glorieta","lagareta","lambeta","lanceta","libreta","maceta","macheta","maleta","malleta","mareta","marreta","meseta","mofeta","muleta","peseta","planeta","raqueta","regreta","saqueta","veleta","vendeta","viñeta"}}, |
||||
# guapete -> guap |
||||
{"ete",3,"",{"alfinete","ariete","bacinete","banquete","barallete","barrete","billete","binguelete","birrete","bonete","bosquete","bufete","burlete","cabalete","cacahuete","cavinete","capacete","carrete","casarete","casete","chupete","clarinete","colchete","colete","capete","curupete","disquete","estilete","falsete","ferrete","filete","gallardete","gobelete","inglete","machete","miquelete","molete","mosquete","piquete","ribete","rodete","rolete","roquete","sorvete","vedete","vendete"}}, |
||||
# práctica -> práct |
||||
{"ica",3,"",{"andarica","botánica","botica","dialéctica","dinámica","física","formica","gráfica","marica","túnica"}}, |
||||
# práctico -> práct |
||||
{"ico",3,"",{"conico","acetifico","acidifico"}}, |
||||
# trapexo -> trap |
||||
{"exo",3,"",{"arpexo","arquexo","asexo","axexo","azulexo","badexo","bafexo","bocexo","bosquexo","boubexo","cacarexo","carrexo","cascarexo","castrexo","convexo","cotexo","desexo","despexo","forcexo","gabexo","gargarexo","gorgolexo","inconexo","manexo","merexo","narnexo","padexo","patexo","sopexo","varexo"}}, |
||||
{"exa",3,"",{"airexa","bandexa","carrexa","envexa","igrexa","larexa","patexa","presexa","sobexa"}}, |
||||
# multidão -> mult |
||||
{"idão",3}, |
||||
# pequeniño -> pequeno |
||||
{"iño",3,"o",{"camiño","cariño","comiño","golfiño","padriño","sobriño","viciño","veciño"}}, |
||||
# pequeniña -> pequena |
||||
{"iña",3,"a",{"camariña","campiña","entreliña","espiña","fariña","moriña","valiña"}}, |
||||
# grandito -> grand |
||||
{"ito",3,""}, |
||||
# grandita -> grand |
||||
{"ita",3,""}, |
||||
# anomaloide -> animal |
||||
{"oide",3,"",{"anaroide","aneroide","asteroide","axoide","cardioide","celuloide","coronoide","discoide","espermatozoide","espiroide","esquizoide","esteroide","glenoide","linfoide","hemorroide","melaloide","sacaroide","tetraploide","varioloide"}}, |
||||
# cazola -> caz |
||||
{"ola",3,"",{"aixola","ampola","argola","arola","arteríola","bandola","bítola","bractéola","cachola","carambola","carapola","carola","carrandiola","catrapola","cebola","centola","champola","chatola","cirola","cítola","consola","corola","empola","escarola","esmola","estola","fitola","florícola","garañola","gárgola","garxola","glicocola","góndola","mariola","marola","michola","pirola","rebola","rupícola","saxícola","sémola","tachola","tómbola"}}, |
||||
# pedrolo -> pedr |
||||
{"olo",3,"",{"arrolo","babiolo","cacharolo","caixarolo","carolo","carramolo","cascarolo","cirolo","codrolo","correolo","cotrolo","desconsolo","rebolo","repolo","subsolo","tixolo","tómbolo","torolo","trémolo","vacúolo","xermolo","zócolo"}}, |
||||
# vellote -> vell |
||||
{"ote",3,"",{"aigote","alcaiote","barbarote","balote","billote","cachote","camarote","capote","cebote","chichote","citote","cocorote","escote","gañote","garrote","gavote","lamote","lapote","larapote","lingote","lítote","magote","marrote","matalote","pandote","paparote","rebote","tagarote","zarrote"}}, |
||||
# mozota -> moz |
||||
{"ota",3,"",{"asíntota","caiota","cambota","chacota","compota","creosota","curota","derrota","díspota","gamota","maniota","pelota","picota","pillota","pixota","queirota","remota"}}, |
||||
# gordocho -> gord |
||||
{"cho",3,"",{"abrocho","arrocho","carocho","falucho","bombacho","borracho","mostacho"}}, |
||||
# gordecha -> gord |
||||
{"cha",3,"",{"borracha","carracha","estacha","garnacha","limacha","remolacha","abrocha"}}, |
||||
# baratuco -> barat |
||||
{"uco",4,"",{"caduco","estuco","fachuco","malluco","saluco","trabuco"}}, |
||||
# borrachuzo -> borrach |
||||
{"uzo",3,"",{"carriñouzo","fachuzo","mañuzo","mestruzo","tapuzo"}}, |
||||
# xentuza -> xent |
||||
{"uza",3,"",{"barruza","chamuza","chapuza","charamuza","conduza","deduza","desluza","entreluza","induza","reluza","seduza","traduza","trasluza"}}, |
||||
# babuxa -> bab |
||||
{"uxa",3,"",{"caramuxa","carrabouxa","cartuxa","coruxa","curuxa","gaturuxa","maruxa","meruxa","miruxa","moruxa","muruxa","papuxa","rabuxa","trouxa"}}, |
||||
{"uxo",3,"",{"caramuxo","carouxo","carrabouxo","curuxo","debuxo","ganduxo","influxo","negouxo","pertuxo","refluxo"}}, |
||||
# grupello -> grup |
||||
{"ello",3,"",{"alborello","artello","botello","cachafello","calello","casarello","cazabello","cercello","cocerello","concello","consello","desparello","escaravello","espello","fedello","fervello","gagafello","gorrobello","nortello","pendello","troupello","trebello"}}, |
||||
# pontella -> pont |
||||
{"ella",3,"",{"alborella","bertorella","bocatella","botella","calella","cercella","gadella","grosella","lentella","movella","nocella","noitevella","parella","pelella","percebella","segorella","sabella"}}}; |
||||
|
||||
{ "Noun", 0, 0, {}, |
||||
# lealdade -> leal |
||||
{"dade",3,"",{"acridade","calidade"}}, |
||||
# clarificar -> clar |
||||
{"ificar",2}, |
||||
# brasileiro->brasil |
||||
{"eiro",3,"",{"agoireiro","bardalleiro","braseiro","barreiro","canteiro","capoeiro","carneiro","carteiro","cinceiro","faroleiro","mareiro","preguiceiro","quinteiro","raposeiro","retranqueiro","regueiro","sineiro","troleiro","ventureiro"}}, |
||||
# marisqueira -> marisqu |
||||
{"eira",3,"",{"cabeleira","canteira","cocheira","folleira","milleira"}}, |
||||
# hospitalario -> hospital |
||||
{"ario",3,"",{"armario","calcario","lionario","salario"}}, |
||||
# bibliotecaria -> bibliotec |
||||
{"aria",3,"",{"cetaria","coronaria","fumaria","linaria","lunaria","parietaria","saponaria","serpentaria"}}, |
||||
# humorístico -> humor |
||||
{"ístico",3,"",{"balístico", "ensaístico"}}, |
||||
# castrista -> castr |
||||
{"ista",3,"",{"batista","ciclista","fadista","operista","tenista","verista"}}, |
||||
# lavado -> lav |
||||
{"ado",2,"",{"grado","agrado"}}, |
||||
# decanato -> decan |
||||
{"ato",2,"",{"agnato"}}, |
||||
# xemido -> xem |
||||
{"ido",3,"",{"cándido","cândido","consolido","decidido","duvido","marido","rápido"}}, |
||||
# mantida -> mant |
||||
{"ida",3,"",{"bastida","dúbida","dubida","duvida","ermida","éxida","guarida","lapicida","medida","morida"}}, |
||||
{"ída",3}, |
||||
# mantído -> mant |
||||
{"ido",3}, |
||||
# orelludo -> orell |
||||
{"udo",3,"",{"estudo","escudo"}}, |
||||
# orelluda -> orell |
||||
{"uda",3}, |
||||
{"ada",3,"",{"abada","alhada","allada","pitada"}}, |
||||
# comedela -> come |
||||
{"dela",3,"",{"cambadela","cavadela","forcadela","erisipidela","mortadela","espadela","fondedela","picadela","arandela","candela","cordela","escudela","pardela"}}, |
||||
# fontela -> font |
||||
{"ela",3,"",{"canela","capela","cotela","cubela","curupela","escarapela","esparrela","estela","fardela","flanela","fornela","franela","gabela","gamela","gavela","glumela","granicela","lamela","lapela","malvela","manela","manganela","mexarela","micela","mistela","novela","ourela","panela","parcela","pasarela","patamela","patela","paxarela","pipela","pitela","postela","pubela","restela","sabela","salmonela","secuela","sentinela","soldanela","subela","temoncela","tesela","tixela","tramela","trapela","varela","vitela","xanela","xestela"}}, |
||||
# agradábel -> agrad |
||||
{"ábel",2,"",{"afábel","fiábel"}}, |
||||
# combustíbel -> combust |
||||
{"íbel",2,"",{"críbel","imposíbel","posíbel","fisíbel","falíbel"}}, |
||||
# fabricante -> frabrica |
||||
{"nte",3,"",{"alimente","adiante","acrescente","elefante","frequente","freqüente","gigante","instante","oriente","permanente","posante","possante","restaurante"}}, |
||||
# ignorancia -> ignora |
||||
{"ncia",3}, |
||||
# temperanza -> tempera |
||||
{"nza",3}, |
||||
{"acia",3,"",{"acracia","audacia","falacia","farmacia"}}, |
||||
# inmundicia -> inmund |
||||
{"icia",3,"",{"caricia","delicia","ledicia","malicia","milicia","noticia","pericia","presbicia","primicia","regalicia","sevicia","tiricia"}}, |
||||
# xustiza -> xust |
||||
{"iza",3,"",{"alvariza","baliza","cachiza","caniza","cañiza","carbaliza","carriza","chamariza","chapiza","fraguiza","latiza","longaniza","mañiza","nabiza","peliza","preguiza","rabiza"}}, |
||||
# clarexar -> clar |
||||
{"exar",3,"",{"palmexar"}}, |
||||
# administración -> administr |
||||
{"ación",2,"",{"aeración"}}, |
||||
# expedición -> exped |
||||
{"ición",3,"",{"condición","gornición","monición","nutrición","petición","posición","sedición","volición"}}, |
||||
# excepción -> except |
||||
{"ción",3,"t"}, |
||||
# comprensión -> comprens |
||||
{"sión",3,"s",{"abrasión", "alusión"}}, |
||||
# doazón -> do |
||||
{"azón",2,"",{"armazón"}}, |
||||
# garrafón -> garraf |
||||
{"ón",3,"",{"abalón","acordeón","alción","aldrabón","alerón","aliñón","ambón","bombón","calzón","campón","canalón","cantón","capitón","cañón","centón","ciclón","collón","colofón","copón","cotón","cupón","petón","tirón","tourón","turón","unción","versión","zubón","zurrón"}}, |
||||
# lambona -> lamb |
||||
{"ona",3,"",{"abandona","acetona","aleurona","amazona","anémona","bombona","cambona","carona","chacona","charamona","cincona","condona","cortisona","cretona","cretona","detona","estona","fitohormona","fregona","gerona","hidroquinona","hormona","lesiona","madona","maratona","matrona","metadona","monótona","neurona","pamplona","peptona","poltrona","proxesterona","quinona","quinona","silicona","sulfona"}}, |
||||
# bretoa -> bretón |
||||
{"oa",3,"",{"abandoa","madroa","barbacoa","estoa","airoa","eiroa","amalloa","ámboa","améndoa","anchoa","antinéboa","avéntoa","avoa","bágoa","balboa","bisavoa","boroa","canoa","caroa","comadroa","coroa","éngoa","espácoa","filloa","fírgoa","grañoa","lagoa","lanzoa","magoa","mámoa","morzoa","noiteboa","noraboa","parañoa","persoa","queiroa","rañoa","táboa","tataravoa","teiroa"}}, |
||||
# demoníaco -> demoní |
||||
{"aco",3}, |
||||
# demoníaca -> demoní |
||||
{"aca",3,"",{"alpaca","barraca","bullaca","buraca","carraca","casaca","cavaca","cloaca","entresaca","ervellaca","espinaca","estaca","farraca","millaca","pastinaca","pataca","resaca","urraca","purraca"}}, |
||||
# carballal -> carball |
||||
{"al",4,"",{"afinal","animal","estatal","bisexual","bissexual","desleal","fiscal","formal","pessoal","persoal","liberal","postal","virtual","visual","pontual","puntual","homosexual","heterosexual"}}, |
||||
# nadador -> nada |
||||
{"dor",2,"",{"abaixador"}}, |
||||
# benfeitor -> benfei |
||||
{"tor",3,"",{"autor","motor","pastor","pintor"}}, |
||||
# produtor -> produt |
||||
{"or",2,"",{"asesor","assessor","favor","mellor","melhor","redor","rigor","sensor","tambor","tumor"}}, |
||||
# profesora -> profes |
||||
{"ora",3,"",{"albacora","anáfora","áncora","apisoadora","ardora","ascospora","aurora","avéspora","bitácora","canéfora","cantimplora","catáfora","cepilladora","demora","descalcificadora","diáspora","empacadora","epífora","ecavadora","escora","eslora","espora","fotocompoñedora","fotocopiadora","grampadora","isícora","lavadora","lixadora","macrospora","madrépora","madrágora","masora","mellora","metáfora","microspora","milépora","milpéndora","nécora","oospora","padeadora","pasiflora","pécora","píldora","pólvora","ratinadora","rémora","retroescavadora","sófora","torradora","trémbora","uredospora","víbora","víncora","zoospora"}}, |
||||
# zapataría -> zapat |
||||
{"aría",3,"",{"libraría"}}, |
||||
# etiquetaxe -> etiquet |
||||
{"axe",3,"",{"aluaxe","amaraxe","amperaxe","bagaxe","balaxe","barcaxe","borraxe","bescaxe","cabotaxe","carraxe","cartilaxe","chantaxe","colaxe","coraxe","carruaxe","dragaxe","embalaxe","ensilaxe","epistaxe","fagundaxe","fichaxe","fogaxe","forraxe","fretaxe","friaxe","garaxe","homenaxe","leitaxe","liñaxe","listaxe","maraxe","marcaxe","maridaxe","masaxe","miraxe","montaxe","pasaxe","peaxe","portaxe","ramaxe","rebelaxe","rodaxe","romaxe","sintaxe","sondaxe","tiraxe","vantaxe","vendaxe","viraxe"}}, |
||||
# movedizo -> move |
||||
{"dizo",3}, |
||||
# limpeza -> limp |
||||
{"eza",3,"",{"alteza","beleza","fereza","fineza","vasteza","vileza"}}, |
||||
# rixidez -> rixid |
||||
{"ez",3,"",{"acidez","adultez","adustez","avidez","candidez","mudez","nenez","nudez","pomez"}}, |
||||
# mullerengo -> muller |
||||
{"engo",3}, |
||||
# chairego -> chair |
||||
{"ego",3,"",{"corego","derrego","entrego","lamego","sarego","sartego"}}, |
||||
# cariñoso -> cariñ |
||||
{"oso",3,"",{"afanoso","algoso","caldoso","caloso","cocoso","ditoso","favoso","fogoso","lamoso","mecoso","mocoso","precioso","rixoso","venoso","viroso","xesoso"}}, |
||||
# cariñosa -> cariñ |
||||
{"osa",3,"",{"mucosa","glicosa","baldosa","celulosa","isoglosa","nitrocelulosa","levulosa","ortosa","pectosa","preciosa","sacarosa","serosa","ventosa"}}, |
||||
# negrume -> negr |
||||
{"ume",3,"",{"agrume","albume","alcume","batume","cacume","cerrume","chorume","churume","costume","curtume","estrume","gafume","legume","perfume","queixume","zarrume"}}, |
||||
# altura -> alt |
||||
{"ura",3,"",{"albura","armadura","imatura","costura"}}, |
||||
# cuspiñar -> cusp |
||||
{"iñar",3}, |
||||
# febril -> febr |
||||
{"il",3,"",{"abril","alfil","anil","atril","badil","baril","barril","brasil","cadril","candil","cantil","carril","chamil","chancil","civil","cubil","dátil","difícil","dócil","edil","estéril","fácil","fráxil","funil","fusil","grácil","gradil","hábil","hostil","marfil"}}, |
||||
# principesco -> princip |
||||
{"esco",4}, |
||||
# mourisco -> mour |
||||
{"isco",4}, |
||||
# esportivo -> esport |
||||
{"ivo",3,"",{"pasivo","positivo","passivo","possessivo","posesivo","pexotarivo","relativo"}}}; |
||||
|
||||
{ "Verb", 0, 0, {}, |
||||
# amaba -> am |
||||
{"aba",2}, |
||||
# andabade -> and |
||||
{"abade",2}, |
||||
# andábade -> and |
||||
{"ábade",2}, |
||||
# chorabamo -> chor |
||||
{"abamo",2}, |
||||
# chorábamo -> chor |
||||
{"ábamo",2}, |
||||
# moraban -> morab |
||||
{"aban",2}, |
||||
# andache -> and |
||||
{"ache",2}, |
||||
# andade -> and |
||||
{"ade",2}, |
||||
{"an",2}, |
||||
# cantando -> cant |
||||
{"ando",2}, |
||||
# cantar -> cant |
||||
{"ar",2,"",{"azar","bazar","patamar"}}, |
||||
# lembrarade -> lembra |
||||
{"arade",2}, |
||||
{"aramo",2}, |
||||
{"arán",2}, |
||||
# cantaran -> cant |
||||
{"aran",2}, |
||||
# convidárade -> convid |
||||
{"árade",2}, |
||||
# convidaría -> convid |
||||
{"aría",2}, |
||||
# cantariade -> cant |
||||
{"ariade",2}, |
||||
# cantaríade -> cant |
||||
{"aríade",2}, |
||||
# cantarian -> cant |
||||
{"arian",2}, |
||||
# cantariamo -> cant |
||||
{"ariamo",2}, |
||||
# pescaron -> pesc |
||||
{"aron",2}, |
||||
# cantase -> cant |
||||
{"ase",2}, |
||||
# cantasede -> cant |
||||
{"asede",2}, |
||||
# cantásede -> cant |
||||
{"ásede",2}, |
||||
# cantasemo -> cant |
||||
{"asemo",2}, |
||||
# cantásemo -> cant |
||||
{"ásemo",2}, |
||||
# cantasen -> cant |
||||
{"asen",2}, |
||||
# loitavan -> loitav |
||||
{"avan",2}, |
||||
# cantaríamo -> cant |
||||
{"aríamo",2}, |
||||
# cantassen -> cant |
||||
{"assen",2}, |
||||
# cantássemo -> cant |
||||
{"ássemo",2}, |
||||
# beberíamo -> beb |
||||
{"eríamo",2}, |
||||
# bebêssemo -> beb |
||||
{"êssemo",2}, |
||||
# partiríamo -> part |
||||
{"iríamo",3}, |
||||
# partíssemo -> part |
||||
{"íssemo",3}, |
||||
# cantáramo -> cant |
||||
{"áramo",2}, |
||||
# cantárei -> cant |
||||
{"árei",2}, |
||||
# cantaren -> cant |
||||
{"aren",2}, |
||||
# cantaremo -> cant |
||||
{"aremo",2}, |
||||
# cantaríei -> cant |
||||
{"aríei",2}, |
||||
{"ássei",2}, |
||||
# cantávamo-> cant |
||||
{"ávamo",2}, |
||||
# bebêramo -> beb |
||||
{"êramo",1}, |
||||
# beberemo -> beb |
||||
{"eremo",1}, |
||||
# beberíei -> beb |
||||
{"eríei",1}, |
||||
# bebêssei -> beb |
||||
{"êssei",1}, |
||||
# partíramo -> part |
||||
{"íramo",3}, |
||||
# partiremo -> part |
||||
{"iremo",3}, |
||||
# partiríei -> part |
||||
{"iríei",3}, |
||||
# partíssei -> part |
||||
{"íssei",3}, |
||||
# partissen -> part |
||||
{"issen",3}, |
||||
# bebendo -> beb |
||||
{"endo",1}, |
||||
# partindo -> part |
||||
{"indo",3}, |
||||
# propondo -> prop |
||||
{"ondo",3}, |
||||
# cantarde -> cant |
||||
{"arde",2}, |
||||
# cantarei -> cant |
||||
{"arei",2}, |
||||
# cantaria -> cant |
||||
{"aria",2}, |
||||
# cantarmo -> cant |
||||
{"armo",2}, |
||||
# cantasse -> cant |
||||
{"asse",2}, |
||||
{"aste",2}, |
||||
# cantávei -> cant |
||||
{"ávei",2}, |
||||
# perderão -> perd |
||||
{"erão",1}, |
||||
# beberde -> beb |
||||
{"erde",1}, |
||||
# beberei -> beb |
||||
{"erei",1}, |
||||
# bebêrei -> beb |
||||
{"êrei",1}, |
||||
# beberen -> beb |
||||
{"eren",2}, |
||||
# beberia -> beb |
||||
{"eria",1}, |
||||
# bebermo -> beb |
||||
{"ermo",1}, |
||||
# bebeste -> beb |
||||
{"este",1,"",{"faroeste","agreste"}}, |
||||
# bebíamo -> beb |
||||
{"íamo",1}, |
||||
# fuxian -> fux |
||||
{"ian",2,"",{"enfian","eloxian","ensaian"}}, |
||||
# partirde -> part |
||||
{"irde",2}, |
||||
# partirei -> part |
||||
{"irei",3,"",{"admirei"}}, |
||||
# partiren -> part |
||||
{"iren",3}, |
||||
# partiria -> part |
||||
{"iria",3}, |
||||
# partirmo -> part |
||||
{"irmo",3}, |
||||
# partisse -> part |
||||
{"isse",3}, |
||||
# partiste -> part |
||||
{"iste",4}, |
||||
{"iava",1,"",{"ampliava"}}, |
||||
# cantamo -> cant |
||||
{"amo",2}, |
||||
# funciona -> func |
||||
{"iona",3}, |
||||
# cantara -> cant |
||||
{"ara",2,"",{"arara","prepara"}}, |
||||
# enviará -> envi |
||||
{"ará",2,"",{"alvará","bacará"}}, |
||||
# cantare -> cant |
||||
{"are",2,"",{"prepare"}}, |
||||
# cantava -> cant |
||||
{"ava",2,"",{"agrava"}}, |
||||
# cantemo -> cant |
||||
{"emo",2}, |
||||
# bebera -> beb |
||||
{"era",1,"",{"acelera","espera"}}, |
||||
# beberá -> beb |
||||
{"erá",1}, |
||||
# bebere -> beb |
||||
{"ere",1,"",{"espere"}}, |
||||
# bebíei -> beb |
||||
{"íei",1}, |
||||
# metin -> met |
||||
{"in",3}, |
||||
# partimo -> part |
||||
{"imo",3,"",{"reprimo","intimo","íntimo","nimo","queimo","ximo"}}, |
||||
# partira -> part |
||||
{"ira",3,"",{"fronteira","sátira"}}, |
||||
{"ído",3}, |
||||
# partirá -> part |
||||
{"irá",3}, |
||||
# concretizar -> concre |
||||
{"tizar",4,"",{"alfabetizar"}}, |
||||
{"izar",3,"",{"organizar"}}, |
||||
# saltitar -> salt |
||||
{"itar",5,"",{"acreditar","explicitar","estreitar"}}, |
||||
# partire -> part |
||||
{"ire",3,"",{"adquire"}}, |
||||
# compomo -> comp |
||||
{"omo",3}, |
||||
{"ai",2}, |
||||
# barbear -> barb |
||||
{"ear",4,"",{"alardear","nuclear"}}, |
||||
# cheguei -> cheg |
||||
{"uei",3}, |
||||
{"uía",5,"u"}, |
||||
# cantei -> cant |
||||
{"ei",3}, |
||||
# beber -> beb |
||||
{"er",1,"",{"éter","pier"}}, |
||||
# bebeu -> beb |
||||
{"eu",1,"",{"chapeu"}}, |
||||
# bebia -> beb |
||||
{"ia",1,"",{"estória","fatia","acia","praia","elogia","mania","lábia","aprecia","polícia","arredia","cheia","ásia"}}, |
||||
# partir -> part |
||||
{"ir",3}, |
||||
# partiu -> part |
||||
{"iu",3}, |
||||
# fraqueou -> fraqu |
||||
{"eou",5}, |
||||
# chegou -> cheg |
||||
{"ou",3}, |
||||
# bebi -> beb |
||||
{"i",1}, |
||||
# varrede -> varr |
||||
{"ede",1,"",{"rede","bípede","céspede","parede","palmípede","vostede","hóspede","adrede"}}, |
||||
# cantei -> cant |
||||
{"ei",3}, |
||||
# anden -> and |
||||
{"en",2}, |
||||
# descerade -> desc |
||||
{"erade",1}, |
||||
# vivérade -> viv |
||||
{"érade",1}, |
||||
# beberan -> beb |
||||
{"eran",2}, |
||||
# colleramo -> coll |
||||
{"eramo",1}, |
||||
# bebéramo -> beb |
||||
{"éramo",1}, |
||||
# perderán -> perd |
||||
{"erán",1}, |
||||
# varrería -> varr |
||||
{"ería",1}, |
||||
# beberiade -> beb |
||||
{"eriade",1}, |
||||
# beberíade -> beb |
||||
{"eríade",1}, |
||||
# beberiamo -> beb |
||||
{"eriamo",1}, |
||||
# beberian -> beb |
||||
{"erian",1}, |
||||
# beberían -> beb |
||||
{"erían",1}, |
||||
# perderon -> perd |
||||
{"eron",1}, |
||||
# bebese -> beb |
||||
{"ese",1}, |
||||
# bebesedes -> beb |
||||
{"esedes",1}, |
||||
# bebésedes -> beb |
||||
{"ésedes",1}, |
||||
# bebesemo -> beb |
||||
{"esemo",1}, |
||||
# bebésemo -> beb |
||||
{"ésemo",1}, |
||||
# bebesen -> beb |
||||
{"esen",1}, |
||||
# bebêssede -> beb |
||||
{"êssede",1}, |
||||
# chovía -> chov |
||||
{"ía",1}, |
||||
# faciade -> fac |
||||
{"iade",1}, |
||||
# facíade -> fac |
||||
{"íade",1}, |
||||
# perdiamo -> perd |
||||
{"iamo",1}, |
||||
# fuxían -> fux |
||||
{"ían",1}, |
||||
# corriche -> corr |
||||
{"iche",1}, |
||||
# partide -> part |
||||
{"ide",1}, |
||||
# escribirade -> escrib |
||||
{"irade",3}, |
||||
# parírade -> par |
||||
{"írade",3}, |
||||
# partiramo -> part |
||||
{"iramo",3}, |
||||
# fugirán -> fug |
||||
{"irán",3}, |
||||
# viviría -> viv |
||||
{"iría",3}, |
||||
# partiriade -> part |
||||
{"iriade",3}, |
||||
# partiríade -> part |
||||
{"iríade",3}, |
||||
# partiriamo -> part |
||||
{"iriamo",3}, |
||||
# partirian -> part |
||||
{"irian",3}, |
||||
# partirían -> part |
||||
{"irían",3}, |
||||
# reflectiron -> reflect |
||||
{"iron",3}, |
||||
# partise -> part |
||||
{"ise",3}, |
||||
# partisede -> part |
||||
{"isede",3}, |
||||
# partísede -> part |
||||
{"ísede",3}, |
||||
# partisemo -> part |
||||
{"isemo",3}, |
||||
# partísemo -> part |
||||
{"ísemo",3}, |
||||
# partisen -> part |
||||
{"isen",3}, |
||||
# partíssede -> part |
||||
{"íssede",3}, |
||||
{"tizar",3,"",{"alfabetizar"}}, |
||||
{"ondo",3}}; |
||||
|
||||
{ "Vowel", 0, 0, {}, |
||||
# segue -> seg |
||||
{"gue",2,"g",{"azougue","dengue","merengue","nurague","merengue","rengue"}}, |
||||
{"que",2,"c",{"alambique","albaricoque","abaroque","alcrique","almadraque","almanaque","arenque","arinque","baduloque","ballestrinque","betoque","bivaque","bloque","bodaque","bosque","breque","buque","cacique","cheque","claque","contradique","coque","croque","dique","duque","enroque","espeque","estoque","estoraque","estraloque","estrinque","milicroque","monicreque","orinque","arinque","palenque","parque","penique","picabeque","pique","psique","raque","remolque","xeque","repenique","roque","sotobosque","tabique","tanque","toque","traque","truque","vivaque","xaque"}}, |
||||
{"a",3,"",{"amasadela","cerva"}}, |
||||
{"e",3,"",{"marte"}}, |
||||
{"o",3,"",{"barro","fado","cabo","libro","cervo"}}, |
||||
{"â",3}, |
||||
{"ã",3,"",{"amanhã","arapuã","fã","divã","manhã"}}, |
||||
{"ê",3}, |
||||
{"ô",3}, |
||||
{"á",3}, |
||||
{"é",3}, |
||||
{"ó",3}, |
||||
# munxi -> munx |
||||
{"i",3}}; |
@ -0,0 +1,161 @@
|
||||
# Galician stopwords |
||||
a |
||||
aínda |
||||
alí |
||||
aquel |
||||
aquela |
||||
aquelas |
||||
aqueles |
||||
aquilo |
||||
aquí |
||||
ao |
||||
aos |
||||
as |
||||
así |
||||
á |
||||
ben |
||||
cando |
||||
che |
||||
co |
||||
coa |
||||
comigo |
||||
con |
||||
connosco |
||||
contigo |
||||
convosco |
||||
coas |
||||
cos |
||||
cun |
||||
cuns |
||||
cunha |
||||
cunhas |
||||
da |
||||
dalgunha |
||||
dalgunhas |
||||
dalgún |
||||
dalgúns |
||||
das |
||||
de |
||||
del |
||||
dela |
||||
delas |
||||
deles |
||||
desde |
||||
deste |
||||
do |
||||
dos |
||||
dun |
||||
duns |
||||
dunha |
||||
dunhas |
||||
e |
||||
el |
||||
ela |
||||
elas |
||||
eles |
||||
en |
||||
era |
||||
eran |
||||
esa |
||||
esas |
||||
ese |
||||
eses |
||||
esta |
||||
estar |
||||
estaba |
||||
está |
||||
están |
||||
este |
||||
estes |
||||
estiven |
||||
estou |
||||
eu |
||||
é |
||||
facer |
||||
foi |
||||
foron |
||||
fun |
||||
había |
||||
hai |
||||
iso |
||||
isto |
||||
la |
||||
las |
||||
lle |
||||
lles |
||||
lo |
||||
los |
||||
mais |
||||
me |
||||
meu |
||||
meus |
||||
min |
||||
miña |
||||
miñas |
||||
moi |
||||
na |
||||
nas |
||||
neste |
||||
nin |
||||
no |
||||
non |
||||
nos |
||||
nosa |
||||
nosas |
||||
noso |
||||
nosos |
||||
nós |
||||
nun |
||||
nunha |
||||
nuns |
||||
nunhas |
||||
o |
||||
os |
||||
ou |
||||
ó |
||||
ós |
||||
para |
||||
pero |
||||
pode |
||||
pois |
||||
pola |
||||
polas |
||||
polo |
||||
polos |
||||
por |
||||
que |
||||
se |
||||
senón |
||||
ser |
||||
seu |
||||
seus |
||||
sexa |
||||
sido |
||||
sobre |
||||
súa |
||||
súas |
||||
tamén |
||||
tan |
||||
te |
||||
ten |
||||
teñen |
||||
teño |
||||
ter |
||||
teu |
||||
teus |
||||
ti |
||||
tido |
||||
tiña |
||||
tiven |
||||
túa |
||||
túas |
||||
un |
||||
unha |
||||
unhas |
||||
uns |
||||
vos |
||||
vosa |
||||
vosas |
||||
voso |
||||
vosos |
||||
vós |
@ -0,0 +1,46 @@
|
||||
# example set of Armenian stopwords. |
||||
այդ |
||||
այլ |
||||
այն |
||||
այս |
||||
դու |
||||
դուք |
||||
եմ |
||||
են |
||||
ենք |
||||
ես |
||||
եք |
||||
է |
||||
էի |
||||
էին |
||||
էինք |
||||
էիր |
||||
էիք |
||||
էր |
||||
ըստ |
||||
թ |
||||
ի |
||||
ին |
||||
իսկ |
||||
իր |
||||
կամ |
||||
համար |
||||
հետ |
||||
հետո |
||||
մենք |
||||
մեջ |
||||
մի |
||||
ն |
||||
նա |
||||
նաև |
||||
նրա |
||||
նրանք |
||||
որ |
||||
որը |
||||
որոնք |
||||
որպես |
||||
ու |
||||
ում |
||||
պիտի |
||||
վրա |
||||
և |
@ -0,0 +1,359 @@
|
||||
# from appendix D of: A Study of Stemming Effects on Information |
||||
# Retrieval in Bahasa Indonesia |
||||
ada |
||||
adanya |
||||
adalah |
||||
adapun |
||||
agak |
||||
agaknya |
||||
agar |
||||
akan |
||||
akankah |
||||
akhirnya |
||||
aku |
||||
akulah |
||||
amat |
||||
amatlah |
||||
anda |
||||
andalah |
||||
antar |
||||
diantaranya |
||||
antara |
||||
antaranya |
||||
diantara |
||||
apa |
||||
apaan |
||||
mengapa |
||||
apabila |
||||
apakah |
||||
apalagi |
||||
apatah |
||||
atau |
||||
ataukah |
||||
ataupun |
||||
bagai |
||||
bagaikan |
||||
sebagai |
||||
sebagainya |
||||
bagaimana |
||||
bagaimanapun |
||||
sebagaimana |
||||
bagaimanakah |
||||
bagi |
||||
bahkan |
||||
bahwa |
||||
bahwasanya |
||||
sebaliknya |
||||
banyak |
||||
sebanyak |
||||
beberapa |
||||
seberapa |
||||
begini |
||||
beginian |
||||
beginikah |
||||
beginilah |
||||
sebegini |
||||
begitu |
||||
begitukah |
||||
begitulah |
||||
begitupun |
||||
sebegitu |
||||
belum |
||||
belumlah |
||||
sebelum |
||||
sebelumnya |
||||
sebenarnya |
||||
berapa |
||||
berapakah |
||||
berapalah |
||||
berapapun |
||||
betulkah |
||||
sebetulnya |
||||
biasa |
||||
biasanya |
||||
bila |
||||
bilakah |
||||
bisa |
||||
bisakah |
||||
sebisanya |
||||
boleh |
||||
bolehkah |
||||
bolehlah |
||||
buat |
||||
bukan |
||||
bukankah |
||||
bukanlah |
||||
bukannya |
||||
cuma |
||||
percuma |
||||
dahulu |
||||
dalam |
||||
dan |
||||
dapat |
||||
dari |
||||
daripada |
||||
dekat |
||||
demi |
||||
demikian |
||||
demikianlah |
||||
sedemikian |
||||
dengan |
||||
depan |
||||
di |
||||
dia |
||||
dialah |
||||
dini |
||||
diri |
||||
dirinya |
||||
terdiri |
||||
dong |
||||
dulu |
||||
enggak |
||||
enggaknya |
||||
entah |
||||
entahlah |
||||
terhadap |
||||
terhadapnya |
||||
hal |
||||
hampir |
||||
hanya |
||||
hanyalah |
||||
harus |
||||
haruslah |
||||
harusnya |
||||
seharusnya |
||||
hendak |
||||
hendaklah |
||||
hendaknya |
||||
hingga |
||||
sehingga |
||||
ia |
||||
ialah |
||||
ibarat |
||||
ingin |
||||
inginkah |
||||
inginkan |
||||
ini |
||||
inikah |
||||
inilah |
||||
itu |
||||
itukah |
||||
itulah |
||||
jangan |
||||
jangankan |
||||
janganlah |
||||
jika |
||||
jikalau |
||||
juga |
||||
justru |
||||
kala |
||||
kalau |
||||
kalaulah |
||||
kalaupun |
||||
kalian |
||||
kami |
||||
kamilah |
||||
kamu |
||||
kamulah |
||||
kan |
||||
kapan |
||||
kapankah |
||||
kapanpun |
||||
dikarenakan |
||||
karena |
||||
karenanya |
||||
ke |
||||
kecil |
||||
kemudian |
||||
kenapa |
||||
kepada |
||||
kepadanya |
||||
ketika |
||||
seketika |
||||
khususnya |
||||
kini |
||||
kinilah |
||||
kiranya |
||||
sekiranya |
||||
kita |
||||
kitalah |
||||
kok |
||||
lagi |
||||
lagian |
||||
selagi |
||||
lah |
||||
lain |
||||
lainnya |
||||
melainkan |
||||
selaku |
||||
lalu |
||||
melalui |
||||
terlalu |
||||
lama |
||||
lamanya |
||||
selama |
||||
selama |
||||
selamanya |
||||
lebih |
||||
terlebih |
||||
bermacam |
||||
macam |
||||
semacam |
||||
maka |
||||
makanya |
||||
makin |
||||
malah |
||||
malahan |
||||
mampu |
||||
mampukah |
||||
mana |
||||
manakala |
||||
manalagi |
||||
masih |
||||
masihkah |
||||
semasih |
||||
masing |
||||
mau |
||||
maupun |
||||
semaunya |
||||
memang |
||||
mereka |
||||
merekalah |
||||
meski |
||||
meskipun |
||||
semula |
||||
mungkin |
||||
mungkinkah |
||||
nah |
||||
namun |
||||
nanti |
||||
nantinya |
||||
nyaris |
||||
oleh |
||||
olehnya |
||||
seorang |
||||
seseorang |
||||
pada |
||||
padanya |
||||
padahal |
||||
paling |
||||
sepanjang |
||||
pantas |
||||
sepantasnya |
||||
sepantasnyalah |
||||
para |
||||
pasti |
||||
pastilah |
||||
per |
||||
pernah |
||||
pula |
||||
pun |
||||
merupakan |
||||
rupanya |
||||
serupa |
||||
saat |
||||
saatnya |
||||
sesaat |
||||
saja |
||||
sajalah |
||||
saling |
||||
bersama |
||||
sama |
||||
sesama |
||||
sambil |
||||
sampai |
||||
sana |
||||
sangat |
||||
sangatlah |
||||
saya |
||||
sayalah |
||||
se |
||||
sebab |
||||
sebabnya |
||||
sebuah |
||||
tersebut |
||||
tersebutlah |
||||
sedang |
||||
sedangkan |
||||
sedikit |
||||
sedikitnya |
||||
segala |
||||
segalanya |
||||
segera |
||||
sesegera |
||||
sejak |
||||
sejenak |
||||
sekali |
||||
sekalian |
||||
sekalipun |
||||
sesekali |
||||
sekaligus |
||||
sekarang |
||||
sekarang |
||||
sekitar |
||||
sekitarnya |
||||
sela |
||||
selain |
||||
selalu |
||||
seluruh |
||||
seluruhnya |
||||
semakin |
||||
sementara |
||||
sempat |
||||
semua |
||||
semuanya |
||||
sendiri |
||||
sendirinya |
||||
seolah |
||||
seperti |
||||
sepertinya |
||||
sering |
||||
seringnya |
||||
serta |
||||
siapa |
||||
siapakah |
||||
siapapun |
||||
disini |
||||
disinilah |
||||
sini |
||||
sinilah |
||||
sesuatu |
||||
sesuatunya |
||||
suatu |
||||
sesudah |
||||
sesudahnya |
||||
sudah |
||||
sudahkah |
||||
sudahlah |
||||
supaya |
||||
tadi |
||||
tadinya |
||||
tak |
||||
tanpa |
||||
setelah |
||||
telah |
||||
tentang |
||||
tentu |
||||
tentulah |
||||
tentunya |
||||
tertentu |
||||
seterusnya |
||||
tapi |
||||
tetapi |
||||
setiap |
||||
tiap |
||||
setidaknya |
||||
tidak |
||||
tidakkah |
||||
tidaklah |
||||
toh |
||||
waduh |
||||
wah |
||||
wahai |
||||
sewaktu |
||||
walau |
||||
walaupun |
||||
wong |
||||
yaitu |
||||
yakni |
||||
yang |
@ -0,0 +1,172 @@
|
||||
# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins |
||||
# the original list of over 800 forms was refined: |
||||
# pronouns, adverbs, interjections were removed |
||||
# |
||||
# prepositions |
||||
aiz |
||||
ap |
||||
ar |
||||
apakš |
||||
ārpus |
||||
augšpus |
||||
bez |
||||
caur |
||||
dēļ |
||||
gar |
||||
iekš |
||||
iz |
||||
kopš |
||||
labad |
||||
lejpus |
||||
līdz |
||||
no |
||||
otrpus |
||||
pa |
||||
par |
||||
pār |
||||
pēc |
||||
pie |
||||
pirms |
||||
pret |
||||
priekš |
||||
starp |
||||
šaipus |
||||
uz |
||||
viņpus |
||||
virs |
||||
virspus |
||||
zem |
||||
apakšpus |
||||
# Conjunctions |
||||
un |
||||
bet |
||||
jo |
||||
ja |
||||
ka |
||||
lai |
||||
tomēr |
||||
tikko |
||||
turpretī |
||||
arī |
||||
kaut |
||||
gan |
||||
tādēļ |
||||
tā |
||||
ne |
||||
tikvien |
||||
vien |
||||
kā |
||||
ir |
||||
te |
||||
vai |
||||
kamēr |
||||
# Particles |
||||
ar |
||||
diezin |
||||
droši |
||||
diemžēl |
||||
nebūt |
||||
ik |
||||
it |
||||
taču |
||||
nu |
||||
pat |
||||
tiklab |
||||
iekšpus |
||||
nedz |
||||
tik |
||||
nevis |
||||
turpretim |
||||
jeb |
||||
iekam |
||||
iekām |
||||
iekāms |
||||
kolīdz |
||||
līdzko |
||||
tiklīdz |
||||
jebšu |
||||
tālab |
||||
tāpēc |
||||
nekā |
||||
itin |
||||
jā |
||||
jau |
||||
jel |
||||
nē |
||||
nezin |
||||
tad |
||||
tikai |
||||
vis |
||||
tak |
||||
iekams |
||||
vien |
||||
# modal verbs |
||||
būt |
||||
biju |
||||
biji |
||||
bija |
||||
bijām |
||||
bijāt |
||||
esmu |
||||
esi |
||||
esam |
||||
esat |
||||
būšu |
||||
būsi |
||||
būs |
||||
būsim |
||||
būsiet |
||||
tikt |
||||
tiku |
||||
tiki |
||||
tika |
||||
tikām |
||||
tikāt |
||||
tieku |
||||
tiec |
||||
tiek |
||||
tiekam |
||||
tiekat |
||||
tikšu |
||||
tiks |
||||
tiksim |
||||
tiksiet |
||||
tapt |
||||
tapi |
||||
tapāt |
||||
topat |
||||
tapšu |
||||
tapsi |
||||
taps |
||||
tapsim |
||||
tapsiet |
||||
kļūt |
||||
kļuvu |
||||
kļuvi |
||||
kļuva |
||||
kļuvām |
||||
kļuvāt |
||||
kļūstu |
||||
kļūsti |
||||
kļūst |
||||
kļūstam |
||||
kļūstat |
||||
kļūšu |
||||
kļūsi |
||||
kļūs |
||||
kļūsim |
||||
kļūsiet |
||||
# verbs |
||||
varēt |
||||
varēju |
||||
varējām |
||||
varēšu |
||||
varēsim |
||||
var |
||||
varēji |
||||
varējāt |
||||
varēsi |
||||
varēsiet |
||||
varat |
||||
varēja |
||||
varēs |
@ -0,0 +1,456 @@
|
||||
# Steps file for the RSLP stemmer. |
||||
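# Format note (an inference from the rules below, matching the step |
||||
# layout documented for Lucene's RSLPStemmerBase, which parses files |
||||
# like this one): a step is |
||||
#   { "Name", min-word-size, end-check, { conditional suffixes }, rules... }; |
||||
# where end-check = 1 means the step only applies to words ending in one |
||||
# of the conditional suffixes (e.g. "Plural" only touches words in -s). |
||||
# Each rule is { "suffix", min-stem-size, "replacement", { exceptions } }: |
||||
# strip the suffix and append the optional replacement, but only if the |
||||
# remaining stem keeps at least min-stem-size characters and the word is |
||||
# not listed as a whole-word exception, e.g. {"ns",1,"m"}: bons -> bom. |
||||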
|
||||
# Step 1: Plural Reduction |
||||
{ "Plural", 3, 1, {"s"}, |
||||
# bons -> bom |
||||
{"ns",1,"m"}, |
||||
# balões -> balão |
||||
{"ões",3,"ão"}, |
||||
# capitães -> capitão |
||||
{"ães",1,"ão",{"mães"}}, |
||||
# normais -> normal |
||||
{"ais",1,"al",{"cais","mais"}}, |
||||
# papéis -> papel |
||||
{"éis",2,"el"}, |
||||
# amáveis -> amável |
||||
{"eis",2,"el"}, |
||||
# lençóis -> lençol |
||||
{"óis",2,"ol"}, |
||||
# barris -> barril |
||||
{"is",2,"il",{"lápis","cais","mais","crúcis","biquínis","pois","depois","dois","leis"}}, |
||||
# males -> mal |
||||
{"les",3,"l"}, |
||||
# mares -> mar |
||||
{"res",3,"r", {"árvores"}}, |
||||
# casas -> casa |
||||
{"s",2,"",{"aliás","pires","lápis","cais","mais","mas","menos","férias","fezes","pêsames","crúcis","gás","atrás","moisés","através","convés","ês","país","após","ambas","ambos","messias", "depois"}}}; |
||||
|
||||
# Step 2: Adverb Reduction |
||||
{ "Adverb", 0, 0, {}, |
||||
# felizmente -> feliz |
||||
{"mente",4,"",{"experimente"}}}; |
||||
|
||||
# Step 3: Feminine Reduction |
||||
{ "Feminine", 3, 1, {"a","ã"}, |
||||
# chefona -> chefão |
||||
{"ona",3,"ão",{"abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","carona"}}, |
||||
# vilã -> vilão |
||||
{"ã",2,"ão",{"amanhã","arapuã","fã","divã"}}, |
||||
# professora -> professor |
||||
{"ora",3,"or"}, |
||||
# americana -> americano |
||||
{"na",4,"no",{"carona","abandona","lona","iona","cortisona","monótona","maratona","acetona","detona","guiana","campana","grana","caravana","banana","paisana"}}, |
||||
# sozinha -> sozinho |
||||
{"inha",3,"inho",{"rainha","linha","minha"}}, |
||||
# inglesa -> inglês |
||||
{"esa",3,"ês",{"mesa","obesa","princesa","turquesa","ilesa","pesa","presa"}}, |
||||
# famosa -> famoso |
||||
{"osa",3,"oso",{"mucosa","prosa"}}, |
||||
# maníaca -> maníaco |
||||
{"íaca",3,"íaco"}, |
||||
# prática -> prático |
||||
{"ica",3,"ico",{"dica"}}, |
||||
# cansada -> cansado |
||||
{"ada",2,"ado",{"pitada"}}, |
||||
# mantida -> mantido |
||||
{"ida",3,"ido",{"vida","dúvida"}}, |
||||
{"ída",3,"ido",{"recaída","saída"}}, |
||||
# prima -> primo |
||||
{"ima",3,"imo",{"vítima"}}, |
||||
# passiva -> passivo |
||||
{"iva",3,"ivo",{"saliva","oliva"}}, |
||||
# primeira -> primeiro |
||||
{"eira",3,"eiro",{"beira","cadeira","frigideira","bandeira","feira","capoeira","barreira","fronteira","besteira","poeira"}}}; |
||||
|
||||
# Step 4: Augmentative/Diminutive Reduction |
||||
{ "Augmentative", 0, 1, {}, |
||||
# cansadíssimo -> cansa |
||||
{"díssimo",5}, |
||||
# amabilíssimo -> am |
||||
{"abilíssimo",5}, |
||||
# fortíssimo -> fort |
||||
{"íssimo",3}, |
||||
{"ésimo",3}, |
||||
# chiquérrimo -> chiqu |
||||
{"érrimo",4}, |
||||
# pezinho -> pe |
||||
{"zinho",2}, |
||||
# maluquinho -> maluc |
||||
{"quinho",4,"c"}, |
||||
# amiguinho -> amig |
||||
{"uinho",4}, |
||||
# cansadinho -> cans |
||||
{"adinho",3}, |
||||
# carrinho -> carr |
||||
{"inho",3,"",{"caminho","cominho"}}, |
||||
# grandalhão -> grand |
||||
{"alhão",4}, |
||||
# dentuça -> dent |
||||
{"uça",4}, |
||||
# ricaço -> ric |
||||
{"aço",4,"",{"antebraço"}}, |
||||
{"aça",4}, |
||||
# cansadão -> cans |
||||
{"adão",4}, |
||||
{"idão",4}, |
||||
# corpázio -> corp |
||||
{"ázio",3,"",{"topázio"}}, |
||||
# pratarraz -> prat |
||||
{"arraz",4}, |
||||
{"zarrão",3}, |
||||
{"arrão",4}, |
||||
# bocarra -> boc |
||||
{"arra",3}, |
||||
# calorzão -> calor |
||||
{"zão",2,"",{"coalizão"}}, |
||||
# meninão -> menin |
||||
{"ão",3,"",{"camarão","chimarrão","canção","coração","embrião","grotão","glutão","ficção","fogão","feição","furacão","gamão","lampião","leão","macacão","nação","órfão","orgão","patrão","portão","quinhão","rincão","tração","falcão","espião","mamão","folião","cordão","aptidão","campeão","colchão","limão","leilão","melão","barão","milhão","bilhão","fusão","cristão","ilusão","capitão","estação","senão"}}}; |
||||
|
||||
# Step 5: Noun Suffix Reduction |
||||
{ "Noun", 0, 0, {}, |
||||
# existencialista -> exist |
||||
{"encialista",4}, |
||||
# minimalista -> minim |
||||
{"alista",5}, |
||||
# contagem -> cont |
||||
{"agem",3,"",{"coragem","chantagem","vantagem","carruagem"}}, |
||||
# gerenciamento -> gerenc |
||||
{"iamento",4}, |
||||
# monitoramento -> monitor |
||||
{"amento",3,"",{"firmamento","fundamento","departamento"}}, |
||||
# nascimento -> nasc |
||||
{"imento",3}, |
||||
{"mento",6,"",{"firmamento","elemento","complemento","instrumento","departamento"}}, |
||||
# comercializado -> comerci |
||||
{"alizado",4}, |
||||
# traumatizado -> traum |
||||
{"atizado",4}, |
||||
{"tizado",4,"",{"alfabetizado"}}, |
||||
# alfabetizado -> alfabet |
||||
{"izado",5,"",{"organizado","pulverizado"}}, |
||||
# associativo -> associ |
||||
{"ativo",4,"",{"pejorativo","relativo"}}, |
||||
# contraceptivo -> contracep |
||||
{"tivo",4,"",{"relativo"}}, |
||||
# esportivo -> esport |
||||
{"ivo",4,"",{"passivo","possessivo","pejorativo","positivo"}}, |
||||
# abalado -> abal |
||||
{"ado",2,"",{"grado"}}, |
||||
# impedido -> imped |
||||
{"ido",3,"",{"cândido","consolido","rápido","decido","tímido","duvido","marido"}}, |
||||
# ralador -> ral |
||||
{"ador",3}, |
||||
# entendedor -> entend |
||||
{"edor",3}, |
||||
# cumpridor -> cumpr |
||||
{"idor",4,"",{"ouvidor"}}, |
||||
{"dor",4,"",{"ouvidor"}}, |
||||
{"sor",4,"",{"assessor"}}, |
||||
{"atoria",5}, |
||||
{"tor",3,"",{"benfeitor","leitor","editor","pastor","produtor","promotor","consultor"}}, |
||||
{"or",2,"",{"motor","melhor","redor","rigor","sensor","tambor","tumor","assessor","benfeitor","pastor","terior","favor","autor"}}, |
||||
# comparabilidade -> compar |
||||
{"abilidade",5}, |
||||
# abolicionista -> abol |
||||
{"icionista",4}, |
||||
# intervencionista -> interven |
||||
{"cionista",5}, |
||||
{"ionista",5}, |
||||
{"ionar",5}, |
||||
# profissional -> profiss |
||||
{"ional",4}, |
||||
# referência -> refer |
||||
{"ência",3}, |
||||
# repugnância -> repugn |
||||
{"ância",4,"",{"ambulância"}}, |
||||
# abatedouro -> abat |
||||
{"edouro",3}, |
||||
# fofoqueiro -> fofoc |
||||
{"queiro",3,"c"}, |
||||
{"adeiro",4,"",{"desfiladeiro"}}, |
||||
# brasileiro -> brasil |
||||
{"eiro",3,"",{"desfiladeiro","pioneiro","mosteiro"}}, |
||||
{"uoso",3}, |
||||
# gostoso -> gost |
||||
{"oso",3,"",{"precioso"}}, |
||||
# comercializaç -> comerci |
||||
{"alizaç",5}, |
||||
{"atizaç",5}, |
||||
{"tizaç",5}, |
||||
{"izaç",5,"",{"organizaç"}}, |
||||
# alegaç -> aleg |
||||
{"aç",3,"",{"equaç","relaç"}}, |
||||
# aboliç -> abol |
||||
{"iç",3,"",{"eleiç"}}, |
||||
# anedotário -> anedot |
||||
{"ário",3,"",{"voluntário","salário","aniversário","diário","lionário","armário"}}, |
||||
{"atório",3}, |
||||
{"rio",5,"",{"voluntário","salário","aniversário","diário","compulsório","lionário","próprio","stério","armário"}}, |
||||
# ministério -> minist |
||||
{"ério",6}, |
||||
# chinês -> chin |
||||
{"ês",4}, |
||||
# beleza -> bel |
||||
{"eza",3}, |
||||
# rigidez -> rigid |
||||
{"ez",4}, |
||||
# parentesco -> parent |
||||
{"esco",4}, |
||||
# ocupante -> ocup |
||||
{"ante",2,"",{"gigante","elefante","adiante","possante","instante","restaurante"}}, |
||||
# bombástico -> bomb |
||||
{"ástico",4,"",{"eclesiástico"}}, |
||||
{"alístico",3}, |
||||
{"áutico",4}, |
||||
{"êutico",4}, |
||||
{"tico",3,"",{"político","eclesiástico","diagnostico","prático","doméstico","diagnóstico","idêntico","alopático","artístico","autêntico","eclético","crítico","critico"}}, |
||||
# polêmico -> polêm |
||||
{"ico",4,"",{"tico","público","explico"}}, |
||||
# produtividade -> produt |
||||
{"ividade",5}, |
||||
# profundidade -> profund |
||||
{"idade",4,"",{"autoridade","comunidade"}}, |
||||
# aposentadoria -> aposentad |
||||
{"oria",4,"",{"categoria"}}, |
||||
# existencial -> exist |
||||
{"encial",5}, |
||||
# artista -> art |
||||
{"ista",4}, |
||||
{"auta",5}, |
||||
# maluquice -> maluc |
||||
{"quice",4,"c"}, |
||||
# chatice -> chat |
||||
{"ice",4,"",{"cúmplice"}}, |
||||
# demoníaco -> demon |
||||
{"íaco",3}, |
||||
# decorrente -> decorr |
||||
{"ente",4,"",{"freqüente","alimente","acrescente","permanente","oriente","aparente"}}, |
||||
{"ense",5}, |
||||
# criminal -> crim |
||||
{"inal",3}, |
||||
# americano -> americ |
||||
{"ano",4}, |
||||
# amável -> am |
||||
{"ável",2,"",{"afável","razoável","potável","vulnerável"}}, |
||||
# combustível -> combust |
||||
{"ível",3,"",{"possível"}}, |
||||
{"vel",5,"",{"possível","vulnerável","solúvel"}}, |
||||
{"bil",3,"vel"}, |
||||
# cobertura -> cobert |
||||
{"ura",4,"",{"imatura","acupuntura","costura"}}, |
||||
{"ural",4}, |
||||
# consensual -> consens |
||||
{"ual",3,"",{"bissexual","virtual","visual","pontual"}}, |
||||
# mundial -> mund |
||||
{"ial",3}, |
||||
# experimental -> experiment |
||||
{"al",4,"",{"afinal","animal","estatal","bissexual","desleal","fiscal","formal","pessoal","liberal","postal","virtual","visual","pontual","sideral","sucursal"}}, |
||||
{"alismo",4}, |
||||
{"ivismo",4}, |
||||
{"ismo",3,"",{"cinismo"}}}; |
||||
|
||||
# Step 6: Verb Suffix Reduction |
||||
{ "Verb", 0, 0, {}, |
||||
# cantaríamo -> cant |
||||
{"aríamo",2}, |
||||
# cantássemo -> cant |
||||
{"ássemo",2}, |
||||
# beberíamo -> beb |
||||
{"eríamo",2}, |
||||
# bebêssemo -> beb |
||||
{"êssemo",2}, |
||||
# partiríamo -> part |
||||
{"iríamo",3}, |
||||
# partíssemo -> part |
||||
{"íssemo",3}, |
||||
# cantáramo -> cant |
||||
{"áramo",2}, |
||||
# cantárei -> cant |
||||
{"árei",2}, |
||||
# cantaremo -> cant |
||||
{"aremo",2}, |
||||
# cantariam -> cant |
||||
{"ariam",2}, |
||||
# cantaríei -> cant |
||||
{"aríei",2}, |
||||
# cantássei -> cant |
||||
{"ássei",2}, |
||||
# cantassem -> cant |
||||
{"assem",2}, |
||||
# cantávamo -> cant |
||||
{"ávamo",2}, |
||||
# bebêramo -> beb |
||||
{"êramo",3}, |
||||
# beberemo -> beb |
||||
{"eremo",3}, |
||||
# beberiam -> beb |
||||
{"eriam",3}, |
||||
# beberíei -> beb |
||||
{"eríei",3}, |
||||
# bebêssei -> beb |
||||
{"êssei",3}, |
||||
# bebessem -> beb |
||||
{"essem",3}, |
||||
# partíramo -> part |
||||
{"íramo",3}, |
||||
# partiremo -> part |
||||
{"iremo",3}, |
||||
# partiriam -> part |
||||
{"iriam",3}, |
||||
# partiríei -> part |
||||
{"iríei",3}, |
||||
# partíssei -> part |
||||
{"íssei",3}, |
||||
# partissem -> part |
||||
{"issem",3}, |
||||
# cantando -> cant |
||||
{"ando",2}, |
||||
# bebendo -> beb |
||||
{"endo",3}, |
||||
# partindo -> part |
||||
{"indo",3}, |
||||
# propondo -> prop |
||||
{"ondo",3}, |
||||
# cantaram -> cant |
||||
{"aram",2}, |
||||
{"arão",2}, |
||||
# cantarde -> cant |
||||
{"arde",2}, |
||||
# cantarei -> cant |
||||
{"arei",2}, |
||||
# cantarem -> cant |
||||
{"arem",2}, |
||||
# cantaria -> cant |
||||
{"aria",2}, |
||||
# cantarmo -> cant |
||||
{"armo",2}, |
||||
# cantasse -> cant |
||||
{"asse",2}, |
||||
# cantaste -> cant |
||||
{"aste",2}, |
||||
# cantavam -> cant |
||||
{"avam",2,"",{"agravam"}}, |
||||
# cantávei -> cant |
||||
{"ávei",2}, |
||||
# beberam -> beb |
||||
{"eram",3}, |
||||
{"erão",3}, |
||||
# beberde -> beb |
||||
{"erde",3}, |
||||
# beberei -> beb |
||||
{"erei",3}, |
||||
# bebêrei -> beb |
||||
{"êrei",3}, |
||||
# beberem -> beb |
||||
{"erem",3}, |
||||
# beberia -> beb |
||||
{"eria",3}, |
||||
# bebermo -> beb |
||||
{"ermo",3}, |
||||
# bebesse -> beb |
||||
{"esse",3}, |
||||
# bebeste -> beb |
||||
{"este",3,"",{"faroeste","agreste"}}, |
||||
# bebíamo -> beb |
||||
{"íamo",3}, |
||||
# partiram -> part |
||||
{"iram",3}, |
||||
# concluíram -> conclu |
||||
{"íram",3}, |
||||
{"irão",2}, |
||||
# partirde -> part |
||||
{"irde",2}, |
||||
# partirei -> part |
||||
{"irei",3,"",{"admirei"}}, |
||||
# partirem -> part |
||||
{"irem",3,"",{"adquirem"}}, |
||||
# partiria -> part |
||||
{"iria",3}, |
||||
# partirmo -> part |
||||
{"irmo",3}, |
||||
# partisse -> part |
||||
{"isse",3}, |
||||
# partiste -> part |
||||
{"iste",4}, |
||||
{"iava",4,"",{"ampliava"}}, |
||||
# cantamo -> cant |
||||
{"amo",2}, |
||||
{"iona",3}, |
||||
# cantara -> cant |
||||
{"ara",2,"",{"arara","prepara"}}, |
||||
# cantará -> cant |
||||
{"ará",2,"",{"alvará"}}, |
||||
# cantare -> cant |
||||
{"are",2,"",{"prepare"}}, |
||||
# cantava -> cant |
||||
{"ava",2,"",{"agrava"}}, |
||||
# cantemo -> cant |
||||
{"emo",2}, |
||||
# bebera -> beb |
||||
{"era",3,"",{"acelera","espera"}}, |
||||
# beberá -> beb |
||||
{"erá",3}, |
||||
# bebere -> beb |
||||
{"ere",3,"",{"espere"}}, |
||||
# bebiam -> beb |
||||
{"iam",3,"",{"enfiam","ampliam","elogiam","ensaiam"}}, |
||||
# bebíei -> beb |
||||
{"íei",3}, |
||||
# partimo -> part |
||||
{"imo",3,"",{"reprimo","intimo","íntimo","nimo","queimo","ximo"}}, |
||||
# partira -> part |
||||
{"ira",3,"",{"fronteira","sátira"}}, |
||||
{"ído",3}, |
||||
# partirá -> part |
||||
{"irá",3}, |
||||
{"tizar",4,"",{"alfabetizar"}}, |
||||
{"izar",5,"",{"organizar"}}, |
||||
{"itar",5,"",{"acreditar","explicitar","estreitar"}}, |
||||
# partire -> part |
||||
{"ire",3,"",{"adquire"}}, |
||||
# compomo -> comp |
||||
{"omo",3}, |
||||
# cantai -> cant |
||||
{"ai",2}, |
||||
# cantam -> cant |
||||
{"am",2}, |
||||
# barbear -> barb |
||||
{"ear",4,"",{"alardear","nuclear"}}, |
||||
# cantar -> cant |
||||
{"ar",2,"",{"azar","bazaar","patamar"}}, |
||||
# cheguei -> cheg |
||||
{"uei",3}, |
||||
{"uía",5,"u"}, |
||||
# cantei -> cant |
||||
{"ei",3}, |
||||
{"guem",3,"g"}, |
||||
# cantem -> cant |
||||
{"em",2,"",{"alem","virgem"}}, |
||||
# beber -> beb |
||||
{"er",2,"",{"éter","pier"}}, |
||||
# bebeu -> beb |
||||
{"eu",3,"",{"chapeu"}}, |
||||
# bebia -> beb |
||||
{"ia",3,"",{"estória","fatia","acia","praia","elogia","mania","lábia","aprecia","polícia","arredia","cheia","ásia"}}, |
||||
# partir -> part |
||||
{"ir",3,"",{"freir"}}, |
||||
# partiu -> part |
||||
{"iu",3}, |
||||
{"eou",5}, |
||||
# chegou -> cheg |
||||
{"ou",3}, |
||||
# bebi -> beb |
||||
{"i",3}}; |
||||
|
||||
# Step 7: Vowel Removal |
||||
{ "Vowel", 0, 0, {}, |
||||
{"bil",2,"vel"}, |
||||
{"gue",2,"g",{"gangue","jegue"}}, |
||||
{"á",3}, |
||||
{"ê",3,"",{"bebê"}}, |
||||
# menina -> menin |
||||
{"a",3,"",{"ásia"}}, |
||||
# grande -> grand |
||||
{"e",3}, |
||||
# menino -> menin |
||||
{"o",3,"",{"ão"}}}; |
@ -0,0 +1,233 @@
|
||||
# This file was created by Jacques Savoy and is distributed under the BSD license. |
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html. |
||||
# Also see http://www.opensource.org/licenses/bsd-license.html |
||||
acea |
||||
aceasta |
||||
această |
||||
aceea |
||||
acei |
||||
aceia |
||||
acel |
||||
acela |
||||
acele |
||||
acelea |
||||
acest |
||||
acesta |
||||
aceste |
||||
acestea |
||||
aceşti |
||||
aceştia |
||||
acolo |
||||
acum |
||||
ai |
||||
aia |
||||
aibă |
||||
aici |
||||
al |
||||
ăla |
||||
ale |
||||
alea |
||||
ălea |
||||
altceva |
||||
altcineva |
||||
am |
||||
ar |
||||
are |
||||
aş |
||||
aşadar |
||||
asemenea |
||||
asta |
||||
ăsta |
||||
astăzi |
||||
astea |
||||
ăstea |
||||
ăştia |
||||
asupra |
||||
aţi |
||||
au |
||||
avea |
||||
avem |
||||
aveţi |
||||
azi |
||||
bine |
||||
bucur |
||||
bună |
||||
ca |
||||
că |
||||
căci |
||||
când |
||||
care |
||||
cărei |
||||
căror |
||||
cărui |
||||
cât |
||||
câte |
||||
câţi |
||||
către |
||||
câtva |
||||
ce |
||||
cel |
||||
ceva |
||||
chiar |
||||
cînd |
||||
cine |
||||
cineva |
||||
cît |
||||
cîte |
||||
cîţi |
||||
cîtva |
||||
contra |
||||
cu |
||||
cum |
||||
cumva |
||||
curând |
||||
curînd |
||||
da |
||||
dă |
||||
dacă |
||||
dar |
||||
datorită |
||||
de |
||||
deci |
||||
deja |
||||
deoarece |
||||
departe |
||||
deşi |
||||
din |
||||
dinaintea |
||||
dintr |
||||
dintre |
||||
drept |
||||
după |
||||
ea |
||||
ei |
||||
el |
||||
ele |
||||
eram |
||||
este |
||||
eşti |
||||
eu |
||||
face |
||||
fără |
||||
fi |
||||
fie |
||||
fiecare |
||||
fii |
||||
fim |
||||
fiţi |
||||
iar |
||||
ieri |
||||
îi |
||||
îl |
||||
îmi |
||||
împotriva |
||||
în |
||||
înainte |
||||
înaintea |
||||
încât |
||||
încît |
||||
încotro |
||||
între |
||||
întrucât |
||||
întrucît |
||||
îţi |
||||
la |
||||
lângă |
||||
le |
||||
li |
||||
lîngă |
||||
lor |
||||
lui |
||||
mă |
||||
mâine |
||||
mea |
||||
mei |
||||
mele |
||||
mereu |
||||
meu |
||||
mi |
||||
mine |
||||
mult |
||||
multă |
||||
mulţi |
||||
ne |
||||
nicăieri |
||||
nici |
||||
nimeni |
||||
nişte |
||||
noastră |
||||
noastre |
||||
noi |
||||
noştri |
||||
nostru |
||||
nu |
||||
ori |
||||
oricând |
||||
oricare |
||||
oricât |
||||
orice |
||||
oricînd |
||||
oricine |
||||
oricît |
||||
oricum |
||||
oriunde |
||||
până |
||||
pe |
||||
pentru |
||||
peste |
||||
pînă |
||||
poate |
||||
pot |
||||
prea |
||||
prima |
||||
primul |
||||
prin |
||||
printr |
||||
sa |
||||
să |
||||
săi |
||||
sale |
||||
sau |
||||
său |
||||
se |
||||
şi |
||||
sînt |
||||
sîntem |
||||
sînteţi |
||||
spre |
||||
sub |
||||
sunt |
||||
suntem |
||||
sunteţi |
||||
ta |
||||
tăi |
||||
tale |
||||
tău |
||||
te |
||||
ţi |
||||
ţie |
||||
tine |
||||
toată |
||||
toate |
||||
tot |
||||
toţi |
||||
totuşi |
||||
tu |
||||
un |
||||
una |
||||
unde |
||||
undeva |
||||
unei |
||||
unele |
||||
uneori |
||||
unor |
||||
vă |
||||
vi |
||||
voastră |
||||
voastre |
||||
voi |
||||
voştri |
||||
vostru |
||||
vouă |
||||
vreo |
||||
vreun |
@ -0,0 +1,108 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A Danish stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from |
||||
| a large text sample. |
||||
|
||||
|
||||
og | and |
||||
i | in |
||||
jeg | I |
||||
det | that (dem. pronoun)/it (pers. pronoun) |
||||
at | that (in front of a sentence)/to (with infinitive) |
||||
en | a/an |
||||
den | it (pers. pronoun)/that (dem. pronoun) |
||||
til | to/at/for/until/against/by/of/into, more |
||||
er | present tense of "to be" |
||||
som | who, as |
||||
på | on/upon/in/on/at/to/after/of/with/for, on |
||||
de | they |
||||
med | with/by/in, along |
||||
han | he |
||||
af | of/by/from/off/for/in/with/on, off |
||||
for | at/for/to/from/by/of/ago, in front/before, because |
||||
ikke | not |
||||
der | who/which, there/those |
||||
var | past tense of "to be" |
||||
mig | me/myself |
||||
sig | oneself/himself/herself/itself/themselves |
||||
men | but |
||||
et | a/an/one, one (number), someone/somebody/one |
||||
har | present tense of "to have" |
||||
om | round/about/for/in/a, about/around/down, if |
||||
vi | we |
||||
min | my |
||||
havde | past tense of "to have" |
||||
ham | him |
||||
hun | she |
||||
nu | now |
||||
over | over/above/across/by/beyond/past/on/about, over/past |
||||
da | then, when/as/since |
||||
fra | from/off/since, off, since |
||||
du | you |
||||
ud | out |
||||
sin | his/her/its/one's |
||||
dem | them |
||||
os | us/ourselves |
||||
op | up |
||||
man | you/one |
||||
hans | his |
||||
hvor | where |
||||
eller | or |
||||
hvad | what |
||||
skal | must/shall etc. |
||||
selv | myself/yourself/herself/ourselves etc., even |
||||
her | here |
||||
alle | all/everyone/everybody etc. |
||||
vil | will (verb) |
||||
blev | past tense of "to stay/to remain/to get/to become" |
||||
kunne | could |
||||
ind | in |
||||
når | when |
||||
være | present tense of "to be" |
||||
dog | however/yet/after all |
||||
noget | something |
||||
ville | would |
||||
jo | you know/you see (adv), yes |
||||
deres | their/theirs |
||||
efter | after/behind/according to/for/by/from, later/afterwards |
||||
ned | down |
||||
skulle | should |
||||
denne | this |
||||
end | than |
||||
dette | this |
||||
mit | my/mine |
||||
også | also |
||||
under | under/beneath/below/during, below/underneath |
||||
have | have |
||||
dig | you |
||||
anden | other |
||||
hende | her |
||||
mine | my |
||||
alt | everything |
||||
meget | much/very, plenty of |
||||
sit | his, her, its, one's |
||||
sine | his, her, its, one's |
||||
vor | our |
||||
mod | against |
||||
disse | these |
||||
hvis | if |
||||
din | your/yours |
||||
nogle | some |
||||
hos | by/at |
||||
blive | be/become |
||||
mange | many |
||||
ad | by/through |
||||
bliver | present tense of "to be/to become" |
||||
hendes | her/hers |
||||
været | be |
||||
thi | for (conj) |
||||
jer | you |
||||
sådan | such, like this/like that |
@ -0,0 +1,117 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A Dutch stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from |
||||
| a large sample of Dutch text. |
||||
|
||||
| Dutch stop words frequently exhibit homonym clashes. These are indicated |
||||
| clearly below. |
||||
|
||||
de | the |
||||
en | and |
||||
van | of, from |
||||
ik | I, the ego |
||||
te | (1) chez, at etc, (2) to, (3) too |
||||
dat | that, which |
||||
die | that, those, who, which |
||||
in | in, inside |
||||
een | a, an, one |
||||
hij | he |
||||
het | the, it |
||||
niet | not, nothing, naught |
||||
zijn | (1) to be, being, (2) his, one's, its |
||||
is | is |
||||
was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river |
||||
op | on, upon, at, in, up, used up |
||||
aan | on, upon, to (as dative) |
||||
met | with, by |
||||
als | like, such as, when |
||||
voor | (1) before, in front of, (2) furrow |
||||
had | had, past tense all persons sing. of 'hebben' (have) |
||||
er | there |
||||
maar | but, only |
||||
om | round, about, for etc |
||||
hem | him |
||||
dan | then |
||||
zou | should/would, past tense all persons sing. of 'zullen' |
||||
of | or, whether, if |
||||
wat | what, something, anything |
||||
mijn | possessive and noun 'mine' |
||||
men | people, 'one' |
||||
dit | this |
||||
zo | so, thus, in this way |
||||
door | through by |
||||
over | over, across |
||||
ze | she, her, they, them |
||||
zich | oneself |
||||
bij | (1) a bee, (2) by, near, at |
||||
ook | also, too |
||||
tot | till, until |
||||
je | you |
||||
mij | me |
||||
uit | out of, from |
||||
der | Old Dutch form of 'van der' still found in surnames |
||||
daar | (1) there, (2) because |
||||
haar | (1) her, their, them, (2) hair |
||||
naar | (1) unpleasant, unwell etc, (2) towards, (3) as |
||||
heb | present first person sing. of 'to have' |
||||
hoe | how, why |
||||
heeft | present third person sing. of 'to have' |
||||
hebben | 'to have' and various parts thereof |
||||
deze | this |
||||
u | you |
||||
want | (1) for, (2) mitten, (3) rigging |
||||
nog | yet, still |
||||
zal | 'shall', first and third person sing. of verb 'zullen' (will) |
||||
me | me |
||||
zij | she, they |
||||
nu | now |
||||
ge | 'thou', still used in Belgium and south Netherlands |
||||
geen | none |
||||
omdat | because |
||||
iets | something, somewhat |
||||
worden | to become, grow, get |
||||
toch | yet, still |
||||
al | all, every, each |
||||
waren | (1) 'were' (2) to wander, (3) wares |
||||
veel | much, many |
||||
meer | (1) more, (2) lake |
||||
doen | to do, to make |
||||
toen | then, when |
||||
moet | noun 'spot/mote' and present form of 'to must' |
||||
ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' |
||||
zonder | without |
||||
kan | noun 'can' and present form of 'to be able' |
||||
hun | their, them |
||||
dus | so, consequently |
||||
alles | all, everything, anything |
||||
onder | under, beneath |
||||
ja | yes, of course |
||||
eens | once, one day |
||||
hier | here |
||||
wie | who |
||||
werd | imperfect third person sing. of 'become' |
||||
altijd | always |
||||
doch | yet, but etc |
||||
wordt | present third person sing. of 'become' |
||||
wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans |
||||
kunnen | to be able |
||||
ons | us/our |
||||
zelf | self |
||||
tegen | against, towards, at |
||||
na | after, near |
||||
reeds | already |
||||
wil | (1) present tense of 'want', (2) 'will', noun, (3) fender |
||||
kon | could; past tense of 'to be able' |
||||
niets | nothing |
||||
uw | your |
||||
iemand | somebody |
||||
geweest | been; past participle of 'be' |
||||
andere | other |
@ -0,0 +1,317 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| An English stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
| Many of the forms below are quite rare (e.g. "yourselves") but included for |
||||
| completeness. |
||||
|
||||
| PRONOUNS FORMS |
||||
| 1st person sing |
||||
|
||||
i | subject, always in upper case of course |
||||
|
||||
me | object |
||||
my | possessive adjective |
||||
| the possessive pronoun `mine' is best suppressed, because of the |
||||
| sense of coal-mine etc. |
||||
myself | reflexive |
||||
| 1st person plural |
||||
we | subject |
||||
|
||||
| us | object |
||||
| care is required here because US = United States. It is usually |
||||
| safe to remove it if it is in lower case. |
||||
our | possessive adjective |
||||
ours | possessive pronoun |
||||
ourselves | reflexive |
||||
| second person (archaic `thou' forms not included) |
||||
you | subject and object |
||||
your | possessive adjective |
||||
yours | possessive pronoun |
||||
yourself | reflexive (singular) |
||||
yourselves | reflexive (plural) |
||||
| third person singular |
||||
he | subject |
||||
him | object |
||||
his | possessive adjective and pronoun |
||||
himself | reflexive |
||||
|
||||
she | subject |
||||
her | object and possessive adjective |
||||
hers | possessive pronoun |
||||
herself | reflexive |
||||
|
||||
it | subject and object |
||||
its | possessive adjective |
||||
itself | reflexive |
||||
| third person plural |
||||
they | subject |
||||
them | object |
||||
their | possessive adjective |
||||
theirs | possessive pronoun |
||||
themselves | reflexive |
||||
| other forms (demonstratives, interrogatives) |
||||
what |
||||
which |
||||
who |
||||
whom |
||||
this |
||||
that |
||||
these |
||||
those |
||||
|
||||
| VERB FORMS (using F.R. Palmer's nomenclature) |
||||
| BE |
||||
am | 1st person, present |
||||
is | -s form (3rd person, present) |
||||
are | present |
||||
was | 1st person, past |
||||
were | past |
||||
be | infinitive |
||||
been | past participle |
||||
being | -ing form |
||||
| HAVE |
||||
have | simple |
||||
has | -s form |
||||
had | past |
||||
having | -ing form |
||||
| DO |
||||
do | simple |
||||
does | -s form |
||||
did | past |
||||
doing | -ing form |
||||
|
||||
| The forms below are, I believe, best omitted, because of the significant |
||||
| homonym forms: |
||||
|
||||
| He made a WILL |
||||
| old tin CAN |
||||
| merry month of MAY |
||||
| a smell of MUST |
||||
| fight the good fight with all thy MIGHT |
||||
|
||||
| would, could, should, ought might however be included |
||||
|
||||
| | AUXILIARIES |
||||
| | WILL |
||||
|will |
||||
|
||||
would |
||||
|
||||
| | SHALL |
||||
|shall |
||||
|
||||
should |
||||
|
||||
| | CAN |
||||
|can |
||||
|
||||
could |
||||
|
||||
| | MAY |
||||
|may |
||||
|might |
||||
| | MUST |
||||
|must |
||||
| | OUGHT |
||||
|
||||
ought |
||||
|
||||
| COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing |
||||
| pronoun + verb |
||||
|
||||
i'm |
||||
you're |
||||
he's |
||||
she's |
||||
it's |
||||
we're |
||||
they're |
||||
i've |
||||
you've |
||||
we've |
||||
they've |
||||
i'd |
||||
you'd |
||||
he'd |
||||
she'd |
||||
we'd |
||||
they'd |
||||
i'll |
||||
you'll |
||||
he'll |
||||
she'll |
||||
we'll |
||||
they'll |
||||
|
||||
| verb + negation |
||||
|
||||
isn't |
||||
aren't |
||||
wasn't |
||||
weren't |
||||
hasn't |
||||
haven't |
||||
hadn't |
||||
doesn't |
||||
don't |
||||
didn't |
||||
|
||||
| auxiliary + negation |
||||
|
||||
won't |
||||
wouldn't |
||||
shan't |
||||
shouldn't |
||||
can't |
||||
cannot |
||||
couldn't |
||||
mustn't |
||||
|
||||
| miscellaneous forms |
||||
|
||||
let's |
||||
that's |
||||
who's |
||||
what's |
||||
here's |
||||
there's |
||||
when's |
||||
where's |
||||
why's |
||||
how's |
||||
|
||||
| rarer forms |
||||
|
||||
| daren't needn't |
||||
|
||||
| doubtful forms |
||||
|
||||
| oughtn't mightn't |
||||
|
||||
| ARTICLES |
||||
a |
||||
an |
||||
the |
||||
|
||||
| THE REST (Overlap among prepositions, conjunctions, adverbs etc is so |
||||
| high, that classification is pointless.) |
||||
and |
||||
but |
||||
if |
||||
or |
||||
because |
||||
as |
||||
until |
||||
while |
||||
|
||||
of |
||||
at |
||||
by |
||||
for |
||||
with |
||||
about |
||||
against |
||||
between |
||||
into |
||||
through |
||||
during |
||||
before |
||||
after |
||||
above |
||||
below |
||||
to |
||||
from |
||||
up |
||||
down |
||||
in |
||||
out |
||||
on |
||||
off |
||||
over |
||||
under |
||||
|
||||
again |
||||
further |
||||
then |
||||
once |
||||
|
||||
here |
||||
there |
||||
when |
||||
where |
||||
why |
||||
how |
||||
|
||||
all |
||||
any |
||||
both |
||||
each |
||||
few |
||||
more |
||||
most |
||||
other |
||||
some |
||||
such |
||||
|
||||
no |
||||
nor |
||||
not |
||||
only |
||||
own |
||||
same |
||||
so |
||||
than |
||||
too |
||||
very |
||||
|
||||
| Just for the record, the following words are among the commonest in English |
||||
|
||||
| one |
||||
| every |
||||
| least |
||||
| less |
||||
| many |
||||
| now |
||||
| ever |
||||
| never |
||||
| say |
||||
| says |
||||
| said |
||||
| also |
||||
| get |
||||
| go |
||||
| goes |
||||
| just |
||||
| made |
||||
| make |
||||
| put |
||||
| see |
||||
| seen |
||||
| whether |
||||
| like |
||||
| well |
||||
| back |
||||
| even |
||||
| still |
||||
| way |
||||
| take |
||||
| since |
||||
| another |
||||
| however |
||||
| two |
||||
| three |
||||
| four |
||||
| five |
||||
| first |
||||
| second |
||||
| new |
||||
| old |
||||
| high |
||||
| long |
@ -0,0 +1,95 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| forms of BE |
||||
|
||||
olla |
||||
olen |
||||
olet |
||||
on |
||||
olemme |
||||
olette |
||||
ovat |
||||
ole | negative form |
||||
|
||||
oli |
||||
olisi |
||||
olisit |
||||
olisin |
||||
olisimme |
||||
olisitte |
||||
olisivat |
||||
olit |
||||
olin |
||||
olimme |
||||
olitte |
||||
olivat |
||||
ollut |
||||
olleet |
||||
|
||||
en | negation |
||||
et |
||||
ei |
||||
emme |
||||
ette |
||||
eivät |
||||
|
||||
|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans |
||||
minä minun minut minua minussa minusta minuun minulla minulta minulle | I |
||||
sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you |
||||
hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she |
||||
me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we |
||||
te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you |
||||
he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they |
||||
|
||||
tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this |
||||
tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that |
||||
se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it |
||||
nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these |
||||
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those |
||||
ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they |
||||
|
||||
kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who |
||||
ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) |
||||
mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what |
||||
mitkä | (pl) |
||||
|
||||
joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which |
||||
jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) |
||||
|
||||
| conjunctions |
||||
|
||||
että | that |
||||
ja | and |
||||
jos | if |
||||
koska | because |
||||
kuin | than |
||||
mutta | but |
||||
niin | so |
||||
sekä | and |
||||
sillä | for |
||||
tai | or |
||||
vaan | but |
||||
vai | or |
||||
vaikka | although |
||||
|
||||
|
||||
| prepositions |
||||
|
||||
kanssa | with |
||||
mukaan | according to |
||||
noin | about |
||||
poikki | across |
||||
yli | over, across |
||||
|
||||
| other |
||||
|
||||
kun | when |
||||
niin | so |
||||
nyt | now |
||||
itse | self |
||||
|
@ -0,0 +1,292 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A German stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
| The number of forms in this list is reduced significantly by passing it |
||||
| through the German stemmer. |
||||
|
||||
|
||||
aber | but |
||||
|
||||
alle | all |
||||
allem |
||||
allen |
||||
aller |
||||
alles |
||||
|
||||
als | than, as |
||||
also | so |
||||
am | an + dem |
||||
an | at |
||||
|
||||
ander | other |
||||
andere |
||||
anderem |
||||
anderen |
||||
anderer |
||||
anderes |
||||
anderm |
||||
andern |
||||
anderr |
||||
anders |
||||
|
||||
auch | also |
||||
auf | on |
||||
aus | out of |
||||
bei | by |
||||
bin | am |
||||
bis | until |
||||
bist | art |
||||
da | there |
||||
damit | with it |
||||
dann | then |
||||
|
||||
der | the |
||||
den |
||||
des |
||||
dem |
||||
die |
||||
das |
||||
|
||||
daß | that |
||||
|
||||
derselbe | the same |
||||
derselben |
||||
denselben |
||||
desselben |
||||
demselben |
||||
dieselbe |
||||
dieselben |
||||
dasselbe |
||||
|
||||
dazu | to that |
||||
|
||||
dein | thy |
||||
deine |
||||
deinem |
||||
deinen |
||||
deiner |
||||
deines |
||||
|
||||
denn | because |
||||
|
||||
derer | of those |
||||
dessen | of him |
||||
|
||||
dich | thee |
||||
dir | to thee |
||||
du | thou |
||||
|
||||
dies | this |
||||
diese |
||||
diesem |
||||
diesen |
||||
dieser |
||||
dieses |
||||
|
||||
|
||||
doch | (several meanings) |
||||
dort | (over) there |
||||
|
||||
|
||||
durch | through |
||||
|
||||
ein | a |
||||
eine |
||||
einem |
||||
einen |
||||
einer |
||||
eines |
||||
|
||||
einig | some |
||||
einige |
||||
einigem |
||||
einigen |
||||
einiger |
||||
einiges |
||||
|
||||
einmal | once |
||||
|
||||
er | he |
||||
ihn | him |
||||
ihm | to him |
||||
|
||||
es | it |
||||
etwas | something |
||||
|
||||
euer | your |
||||
eure |
||||
eurem |
||||
euren |
||||
eurer |
||||
eures |
||||
|
||||
für | for |
||||
gegen | towards |
||||
gewesen | p.p. of sein |
||||
hab | have |
||||
habe | have |
||||
haben | have |
||||
hat | has |
||||
hatte | had |
||||
hatten | had |
||||
hier | here |
||||
hin | there |
||||
hinter | behind |
||||
|
||||
ich | I |
||||
mich | me |
||||
mir | to me |
||||
|
||||
|
||||
ihr | you, to her |
||||
ihre |
||||
ihrem |
||||
ihren |
||||
ihrer |
||||
ihres |
||||
euch | to you |
||||
|
||||
im | in + dem |
||||
in | in |
||||
indem | while |
||||
ins | in + das |
||||
ist | is |
||||
|
||||
jede | each, every |
||||
jedem |
||||
jeden |
||||
jeder |
||||
jedes |
||||
|
||||
jene | that |
||||
jenem |
||||
jenen |
||||
jener |
||||
jenes |
||||
|
||||
jetzt | now |
||||
kann | can |
||||
|
||||
kein | no |
||||
keine |
||||
keinem |
||||
keinen |
||||
keiner |
||||
keines |
||||
|
||||
können | can |
||||
könnte | could |
||||
machen | do |
||||
man | one |
||||
|
||||
manche | some, many a |
||||
manchem |
||||
manchen |
||||
mancher |
||||
manches |
||||
|
||||
mein | my |
||||
meine |
||||
meinem |
||||
meinen |
||||
meiner |
||||
meines |
||||
|
||||
mit | with |
||||
muss | must |
||||
musste | had to |
||||
nach | to(wards) |
||||
nicht | not |
||||
nichts | nothing |
||||
noch | still, yet |
||||
nun | now |
||||
nur | only |
||||
ob | whether |
||||
oder | or |
||||
ohne | without |
||||
sehr | very |
||||
|
||||
sein | his |
||||
seine |
||||
seinem |
||||
seinen |
||||
seiner |
||||
seines |
||||
|
||||
selbst | self |
||||
sich | herself |
||||
|
||||
sie | they, she |
||||
ihnen | to them |
||||
|
||||
sind | are |
||||
so | so |
||||
|
||||
solche | such |
||||
solchem |
||||
solchen |
||||
solcher |
||||
solches |
||||
|
||||
soll | shall |
||||
sollte | should |
||||
sondern | but |
||||
sonst | else |
||||
über | over |
||||
um | about, around |
||||
und | and |
||||
|
||||
uns | us |
||||
unse |
||||
unsem |
||||
unsen |
||||
unser |
||||
unses |
||||
|
||||
unter | under |
||||
viel | much |
||||
vom | von + dem |
||||
von | from |
||||
vor | before |
||||
während | while |
||||
war | was |
||||
waren | were |
||||
warst | wast |
||||
was | what |
||||
weg | away, off |
||||
weil | because |
||||
weiter | further |
||||
|
||||
welche | which |
||||
welchem |
||||
welchen |
||||
welcher |
||||
welches |
||||
|
||||
wenn | when |
||||
werde | will |
||||
werden | will |
||||
wie | how |
||||
wieder | again |
||||
will | want |
||||
wir | we |
||||
wird | will |
||||
wirst | wilt |
||||
wo | where |
||||
wollen | want |
||||
wollte | wanted |
||||
würde | would |
||||
würden | would |
||||
zu | to |
||||
zum | zu + dem |
||||
zur | zu + der |
||||
zwar | indeed |
||||
zwischen | between |
||||
|
@ -0,0 +1,209 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| Hungarian stop word list |
||||
| prepared by Anna Tordai |
||||
|
||||
a |
||||
ahogy |
||||
ahol |
||||
aki |
||||
akik |
||||
akkor |
||||
alatt |
||||
által |
||||
általában |
||||
amely |
||||
amelyek |
||||
amelyekben |
||||
amelyeket |
||||
amelyet |
||||
amelynek |
||||
ami |
||||
amit |
||||
amolyan |
||||
amíg |
||||
amikor |
||||
át |
||||
abban |
||||
ahhoz |
||||
annak |
||||
arra |
||||
arról |
||||
az |
||||
azok |
||||
azon |
||||
azt |
||||
azzal |
||||
azért |
||||
aztán |
||||
azután |
||||
azonban |
||||
bár |
||||
be |
||||
belül |
||||
benne |
||||
cikk |
||||
cikkek |
||||
cikkeket |
||||
csak |
||||
de |
||||
e |
||||
eddig |
||||
egész |
||||
egy |
||||
egyes |
||||
egyetlen |
||||
egyéb |
||||
egyik |
||||
egyre |
||||
ekkor |
||||
el |
||||
elég |
||||
ellen |
||||
elő |
||||
először |
||||
előtt |
||||
első |
||||
én |
||||
éppen |
||||
ebben |
||||
ehhez |
||||
emilyen |
||||
ennek |
||||
erre |
||||
ez |
||||
ezt |
||||
ezek |
||||
ezen |
||||
ezzel |
||||
ezért |
||||
és |
||||
fel |
||||
felé |
||||
hanem |
||||
hiszen |
||||
hogy |
||||
hogyan |
||||
igen |
||||
így |
||||
illetve |
||||
ill. |
||||
ill |
||||
ilyen |
||||
ilyenkor |
||||
ison |
||||
ismét |
||||
itt |
||||
jó |
||||
jól |
||||
jobban |
||||
kell |
||||
kellett |
||||
keresztül |
||||
keressünk |
||||
ki |
||||
kívül |
||||
között |
||||
közül |
||||
legalább |
||||
lehet |
||||
lehetett |
||||
legyen |
||||
lenne |
||||
lenni |
||||
lesz |
||||
lett |
||||
maga |
||||
magát |
||||
majd |
||||
már |
||||
más |
||||
másik |
||||
meg |
||||
még |
||||
mellett |
||||
mert |
||||
mely |
||||
melyek |
||||
mi |
||||
mit |
||||
míg |
||||
miért |
||||
milyen |
||||
mikor |
||||
minden |
||||
mindent |
||||
mindenki |
||||
mindig |
||||
mint |
||||
mintha |
||||
mivel |
||||
most |
||||
nagy |
||||
nagyobb |
||||
nagyon |
||||
ne |
||||
néha |
||||
nekem |
||||
neki |
||||
nem |
||||
néhány |
||||
nélkül |
||||
nincs |
||||
olyan |
||||
ott |
||||
össze |
||||
ő |
||||
ők |
||||
őket |
||||
pedig |
||||
persze |
||||
rá |
||||
s |
||||
saját |
||||
sem |
||||
semmi |
||||
sok |
||||
sokat |
||||
sokkal |
||||
számára |
||||
szemben |
||||
szerint |
||||
szinte |
||||
talán |
||||
tehát |
||||
teljes |
||||
tovább |
||||
továbbá |
||||
több |
||||
úgy |
||||
ugyanis |
||||
új |
||||
újabb |
||||
újra |
||||
után |
||||
utána |
||||
utolsó |
||||
vagy |
||||
vagyis |
||||
valaki |
||||
valami |
||||
valamint |
||||
való |
||||
vagyok |
||||
van |
||||
vannak |
||||
volt |
||||
voltam |
||||
voltak |
||||
voltunk |
||||
vissza |
||||
vele |
||||
viszont |
||||
volna |
@ -0,0 +1,301 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| An Italian stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
ad | a (to) before vowel |
||||
al | a + il |
||||
allo | a + lo |
||||
ai | a + i |
||||
agli | a + gli |
||||
all | a + l' |
||||
agl | a + gl' |
||||
alla | a + la |
||||
alle | a + le |
||||
con | with |
||||
col | con + il |
||||
coi | con + i (forms collo, cogli etc are now very rare) |
||||
da | from |
||||
dal | da + il |
||||
dallo | da + lo |
||||
dai | da + i |
||||
dagli | da + gli |
||||
dall | da + l' |
||||
dagl | da + gl' |
||||
dalla | da + la |
||||
dalle | da + le |
||||
di | of |
||||
del | di + il |
||||
dello | di + lo |
||||
dei | di + i |
||||
degli | di + gli |
||||
dell | di + l' |
||||
degl | di + gl' |
||||
della | di + la |
||||
delle | di + le |
||||
in | in |
||||
nel | in + il |
||||
nello | in + lo |
||||
nei | in + i |
||||
negli | in + gli |
||||
nell | in + l' |
||||
negl | in + gl' |
||||
nella | in + la |
||||
nelle | in + le |
||||
su | on |
||||
sul | su + il |
||||
sullo | su + lo |
||||
sui | su + i |
||||
sugli | su + gli |
||||
sull | su + l' |
||||
sugl | su + gl' |
||||
sulla | su + la |
||||
sulle | su + le |
||||
per | through, by |
||||
tra | among |
||||
contro | against |
||||
io | I |
||||
tu | thou |
||||
lui | he |
||||
lei | she |
||||
noi | we |
||||
voi | you |
||||
loro | they |
||||
mio | my |
||||
mia | |
||||
miei | |
||||
mie | |
||||
tuo | |
||||
tua | |
||||
tuoi | thy |
||||
tue | |
||||
suo | |
||||
sua | |
||||
suoi | his, her |
||||
sue | |
||||
nostro | our |
||||
nostra | |
||||
nostri | |
||||
nostre | |
||||
vostro | your |
||||
vostra | |
||||
vostri | |
||||
vostre | |
||||
mi | me |
||||
ti | thee |
||||
ci | us, there |
||||
vi | you, there |
||||
lo | him, the |
||||
la | her, the |
||||
li | them |
||||
le | them, the |
||||
gli | to him, the |
||||
ne | from there etc |
||||
il | the |
||||
un | a |
||||
uno | a |
||||
una | a |
||||
ma | but |
||||
ed | and |
||||
se | if |
||||
perché | why, because |
||||
anche | also |
||||
come | how |
||||
dov | where (as dov') |
||||
dove | where |
||||
che | who, that |
||||
chi | who |
||||
cui | whom |
||||
non | not |
||||
più | more |
||||
quale | who, that |
||||
quanto | how much |
||||
quanti | |
||||
quanta | |
||||
quante | |
||||
quello | that |
||||
quelli | |
||||
quella | |
||||
quelle | |
||||
questo | this |
||||
questi | |
||||
questa | |
||||
queste | |
||||
si | yes |
||||
tutto | all |
||||
tutti | all |
||||
|
||||
| single letter forms: |
||||
|
||||
a | at |
||||
c | as c' for ce or ci |
||||
e | and |
||||
i | the |
||||
l | as l' |
||||
o | or |
||||
|
||||
| forms of avere, to have (not including the infinitive): |
||||
|
||||
ho |
||||
hai |
||||
ha |
||||
abbiamo |
||||
avete |
||||
hanno |
||||
abbia |
||||
abbiate |
||||
abbiano |
||||
avrò |
||||
avrai |
||||
avrà |
||||
avremo |
||||
avrete |
||||
avranno |
||||
avrei |
||||
avresti |
||||
avrebbe |
||||
avremmo |
||||
avreste |
||||
avrebbero |
||||
avevo |
||||
avevi |
||||
aveva |
||||
avevamo |
||||
avevate |
||||
avevano |
||||
ebbi |
||||
avesti |
||||
ebbe |
||||
avemmo |
||||
aveste |
||||
ebbero |
||||
avessi |
||||
avesse |
||||
avessimo |
||||
avessero |
||||
avendo |
||||
avuto |
||||
avuta |
||||
avuti |
||||
avute |
||||
|
||||
| forms of essere, to be (not including the infinitive): |
||||
sono |
||||
sei |
||||
è |
||||
siamo |
||||
siete |
||||
sia |
||||
siate |
||||
siano |
||||
sarò |
||||
sarai |
||||
sarà |
||||
saremo |
||||
sarete |
||||
saranno |
||||
sarei |
||||
saresti |
||||
sarebbe |
||||
saremmo |
||||
sareste |
||||
sarebbero |
||||
ero |
||||
eri |
||||
era |
||||
eravamo |
||||
eravate |
||||
erano |
||||
fui |
||||
fosti |
||||
fu |
||||
fummo |
||||
foste |
||||
furono |
||||
fossi |
||||
fosse |
||||
fossimo |
||||
fossero |
||||
essendo |
||||
|
||||
| forms of fare, to do (not including the infinitive, fa, fat-): |
||||
faccio |
||||
fai |
||||
facciamo |
||||
fanno |
||||
faccia |
||||
facciate |
||||
facciano |
||||
farò |
||||
farai |
||||
farà |
||||
faremo |
||||
farete |
||||
faranno |
||||
farei |
||||
faresti |
||||
farebbe |
||||
faremmo |
||||
fareste |
||||
farebbero |
||||
facevo |
||||
facevi |
||||
faceva |
||||
facevamo |
||||
facevate |
||||
facevano |
||||
feci |
||||
facesti |
||||
fece |
||||
facemmo |
||||
faceste |
||||
fecero |
||||
facessi |
||||
facesse |
||||
facessimo |
||||
facessero |
||||
facendo |
||||
|
||||
| forms of stare, to be (not including the infinitive): |
||||
sto |
||||
stai |
||||
sta |
||||
stiamo |
||||
stanno |
||||
stia |
||||
stiate |
||||
stiano |
||||
starò |
||||
starai |
||||
starà |
||||
staremo |
||||
starete |
||||
staranno |
||||
starei |
||||
staresti |
||||
starebbe |
||||
staremmo |
||||
stareste |
||||
starebbero |
||||
stavo |
||||
stavi |
||||
stava |
||||
stavamo |
||||
stavate |
||||
stavano |
||||
stetti |
||||
stesti |
||||
stette |
||||
stemmo |
||||
steste |
||||
stettero |
||||
stessi |
||||
stesse |
||||
stessimo |
||||
stessero |
||||
stando |
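
| All the Snowball lists in this change share one format: a vertical bar starts a comment, and each stop word sits at the start of a line. A minimal sketch of loading such a file with Lucene's WordlistLoader follows; the repackaged class paths and the Version constant are assumptions based on stock Lucene 4.x, where getSnowballWordSet(Reader, Version) has this shape.

    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.Reader;

    import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
    import com.fr.third.org.apache.lucene.analysis.util.WordlistLoader;
    import com.fr.third.org.apache.lucene.util.Version;

    public final class SnowballStopwords {
        /** Parses a Snowball-format stopword file: '|' comments, words at line start. */
        public static CharArraySet load(InputStream in) throws IOException {
            Reader reader = new InputStreamReader(in, "UTF-8");
            try {
                // Version.LUCENE_40 is an assumption; use whatever constant this fork exposes.
                return WordlistLoader.getSnowballWordSet(reader, Version.LUCENE_40);
            } finally {
                reader.close();
            }
        }
    }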
@ -0,0 +1,192 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A Norwegian stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
| This stop word list is for the dominant bokmål dialect. Words unique |
||||
| to nynorsk are marked *. |
||||
|
||||
| Revised by Jan Bruusgaard <Jan.Bruusgaard@ssb.no>, Jan 2005 |
||||
|
||||
og | and |
||||
i | in |
||||
jeg | I |
||||
det | it/this/that |
||||
at | to (w. inf.) |
||||
en | a/an |
||||
et | a/an |
||||
den | it/this/that |
||||
til | to |
||||
er | is/am/are |
||||
som | who/that |
||||
på | on |
||||
de | they / you(formal) |
||||
med | with |
||||
han | he |
||||
av | of |
||||
ikke | not |
||||
ikkje | not * |
||||
der | there |
||||
så | so |
||||
var | was/were |
||||
meg | me |
||||
seg | oneself (reflexive) |
||||
men | but |
||||
ett | one |
||||
har | have |
||||
om | about |
||||
vi | we |
||||
min | my |
||||
mitt | my |
||||
ha | have |
||||
hadde | had |
||||
hun | she |
||||
nå | now |
||||
over | over |
||||
da | when/as |
||||
ved | by/know |
||||
fra | from |
||||
du | you |
||||
ut | out |
||||
sin | your |
||||
dem | them |
||||
oss | us |
||||
opp | up |
||||
man | you/one |
||||
kan | can |
||||
hans | his |
||||
hvor | where |
||||
eller | or |
||||
hva | what |
||||
skal | shall/must |
||||
selv | self (reflexive) |
||||
sjøl | self (reflexive) |
||||
her | here |
||||
alle | all |
||||
vil | will |
||||
bli | become |
||||
ble | became |
||||
blei | became * |
||||
blitt | have become |
||||
kunne | could |
||||
inn | in |
||||
når | when |
||||
være | be |
||||
kom | come |
||||
noen | some |
||||
noe | some |
||||
ville | would |
||||
dere | you |
||||
som | who/which/that |
||||
deres | their/theirs |
||||
kun | only/just |
||||
ja | yes |
||||
etter | after |
||||
ned | down |
||||
skulle | should |
||||
denne | this |
||||
for | for/because |
||||
deg | you |
||||
si | hers/his |
||||
sine | hers/his |
||||
sitt | hers/his |
||||
mot | against |
||||
å | to |
||||
meget | much |
||||
hvorfor | why |
||||
dette | this |
||||
disse | these/those |
||||
uten | without |
||||
hvordan | how |
||||
ingen | none |
||||
din | your |
||||
ditt | your |
||||
blir | become |
||||
samme | same |
||||
hvilken | which |
||||
hvilke | which (plural) |
||||
sånn | such a |
||||
inni | inside/within |
||||
mellom | between |
||||
vår | our |
||||
hver | each |
||||
hvem | who |
||||
vors | us/ours |
||||
hvis | whose |
||||
både | both |
||||
bare | only/just |
||||
enn | than |
||||
fordi | as/because |
||||
før | before |
||||
mange | many |
||||
også | also |
||||
slik | just |
||||
vært | been |
||||
være | to be |
||||
båe | both * |
||||
begge | both |
||||
siden | since |
||||
dykk | your * |
||||
dykkar | yours * |
||||
dei | they * |
||||
deira | them * |
||||
deires | theirs * |
||||
deim | them * |
||||
di | your (fem.) * |
||||
då | as/when * |
||||
eg | I * |
||||
ein | a/an * |
||||
eit | a/an * |
||||
eitt | a/an * |
||||
elles | else * |
||||
honom | he * |
||||
hjå | at * |
||||
ho | she * |
||||
hoe | she * |
||||
henne | her |
||||
hennar | her/hers |
||||
hennes | hers |
||||
hoss | how * |
||||
hossen | how * |
||||
ikkje | not * |
||||
ingi | noone * |
||||
inkje | noone * |
||||
korleis | how * |
||||
korso | how * |
||||
kva | what/which * |
||||
kvar | where * |
||||
kvarhelst | where * |
||||
kven | who/whom * |
||||
kvi | why * |
||||
kvifor | why * |
||||
me | we * |
||||
medan | while * |
||||
mi | my * |
||||
mine | my * |
||||
mykje | much * |
||||
no | now * |
||||
nokon | some (masc./neut.) * |
||||
noka | some (fem.) * |
||||
nokor | some * |
||||
noko | some * |
||||
nokre | some * |
||||
si | his/hers * |
||||
sia | since * |
||||
sidan | since * |
||||
so | so * |
||||
somt | some * |
||||
somme | some * |
||||
um | about * |
||||
upp | up * |
||||
vere | be * |
||||
vore | was * |
||||
verte | become * |
||||
vort | become * |
||||
varte | became * |
||||
vart | became * |
||||
|
@ -0,0 +1,251 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A Portuguese stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
|
||||
| The following is a ranked list (commonest to rarest) of stopwords |
||||
| deriving from a large sample of text. |
||||
|
||||
| Extra words have been added at the end. |
||||
|
||||
de | of, from |
||||
a | the; to, at; her |
||||
o | the; him |
||||
que | who, that |
||||
e | and |
||||
do | de + o |
||||
da | de + a |
||||
em | in |
||||
um | a |
||||
para | for |
||||
| é from SER |
||||
com | with |
||||
não | not, no |
||||
uma | a |
||||
os | the; them |
||||
no | em + o |
||||
se | himself etc |
||||
na | em + a |
||||
por | for |
||||
mais | more |
||||
as | the; them |
||||
dos | de + os |
||||
como | as, like |
||||
mas | but |
||||
| foi from SER |
||||
ao | a + o |
||||
ele | he |
||||
das | de + as |
||||
| tem from TER |
||||
à | a + a |
||||
seu | his |
||||
sua | her |
||||
ou | or |
||||
| ser from SER |
||||
quando | when |
||||
muito | much |
||||
| há from HAV |
||||
nos | em + os; us |
||||
já | already, now |
||||
| está from EST |
||||
eu | I |
||||
também | also |
||||
só | only, just |
||||
pelo | per + o |
||||
pela | per + a |
||||
até | up to |
||||
isso | that |
||||
ela | she |
||||
entre | between |
||||
| era from SER |
||||
depois | after |
||||
sem | without |
||||
mesmo | same |
||||
aos | a + os |
||||
| ter from TER |
||||
seus | his |
||||
quem | whom |
||||
nas | em + as |
||||
me | me |
||||
esse | that |
||||
eles | they |
||||
| estão from EST |
||||
você | you |
||||
| tinha from TER |
||||
| foram from SER |
||||
essa | that |
||||
num | em + um |
||||
nem | nor |
||||
suas | her |
||||
meu | my |
||||
às | a + as |
||||
minha | my |
||||
| têm from TER |
||||
numa | em + uma |
||||
pelos | per + os |
||||
elas | they |
||||
| havia from HAV |
||||
| seja from SER |
||||
qual | which |
||||
| será from SER |
||||
nós | we |
||||
| tenho from TER |
||||
lhe | to him, her |
||||
deles | of them |
||||
essas | those |
||||
esses | those |
||||
pelas | per + as |
||||
este | this |
||||
| fosse from SER |
||||
dele | of him |
||||
|
||||
| other words. There are many contractions such as naquele = em+aquele, |
||||
| mo = me+o, but they are rare. |
||||
| Indefinite article plural forms are also rare. |
||||
|
||||
tu | thou |
||||
te | thee |
||||
vocês | you (plural) |
||||
vos | you |
||||
lhes | to them |
||||
meus | my |
||||
minhas |
||||
teu | thy |
||||
tua |
||||
teus |
||||
tuas |
||||
nosso | our |
||||
nossa |
||||
nossos |
||||
nossas |
||||
|
||||
dela | of her |
||||
delas | of them |
||||
|
||||
esta | this |
||||
estes | these |
||||
estas | these |
||||
aquele | that |
||||
aquela | that |
||||
aqueles | those |
||||
aquelas | those |
||||
isto | this |
||||
aquilo | that |
||||
|
||||
| forms of estar, to be (not including the infinitive): |
||||
estou |
||||
está |
||||
estamos |
||||
estão |
||||
estive |
||||
esteve |
||||
estivemos |
||||
estiveram |
||||
estava |
||||
estávamos |
||||
estavam |
||||
estivera |
||||
estivéramos |
||||
esteja |
||||
estejamos |
||||
estejam |
||||
estivesse |
||||
estivéssemos |
||||
estivessem |
||||
estiver |
||||
estivermos |
||||
estiverem |
||||
|
||||
| forms of haver, to have (not including the infinitive): |
||||
hei |
||||
há |
||||
havemos |
||||
hão |
||||
houve |
||||
houvemos |
||||
houveram |
||||
houvera |
||||
houvéramos |
||||
haja |
||||
hajamos |
||||
hajam |
||||
houvesse |
||||
houvéssemos |
||||
houvessem |
||||
houver |
||||
houvermos |
||||
houverem |
||||
houverei |
||||
houverá |
||||
houveremos |
||||
houverão |
||||
houveria |
||||
houveríamos |
||||
houveriam |
||||
|
||||
| forms of ser, to be (not including the infinitive): |
||||
sou |
||||
somos |
||||
são |
||||
era |
||||
éramos |
||||
eram |
||||
fui |
||||
foi |
||||
fomos |
||||
foram |
||||
fora |
||||
fôramos |
||||
seja |
||||
sejamos |
||||
sejam |
||||
fosse |
||||
fôssemos |
||||
fossem |
||||
for |
||||
formos |
||||
forem |
||||
serei |
||||
será |
||||
seremos |
||||
serão |
||||
seria |
||||
seríamos |
||||
seriam |
||||
|
||||
| forms of ter, to have (not including the infinitive): |
||||
tenho |
||||
tem |
||||
temos |
||||
têm |
||||
tinha |
||||
tínhamos |
||||
tinham |
||||
tive |
||||
teve |
||||
tivemos |
||||
tiveram |
||||
tivera |
||||
tivéramos |
||||
tenha |
||||
tenhamos |
||||
tenham |
||||
tivesse |
||||
tivéssemos |
||||
tivessem |
||||
tiver |
||||
tivermos |
||||
tiverem |
||||
terei |
||||
terá |
||||
teremos |
||||
terão |
||||
teria |
||||
teríamos |
||||
teriam |
@ -0,0 +1,241 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| a russian stop word list. comments begin with vertical bar. each stop |
||||
| word is at the start of a line. |
||||
|
||||
| this is a ranked list (commonest to rarest) of stopwords derived from |
||||
| a large text sample. |
||||
|
||||
| letter `ё' is translated to `е'. |
||||
|
||||
и | and |
||||
в | in/into |
||||
во | alternative form |
||||
не | not |
||||
что | what/that |
||||
он | he |
||||
на | on/onto |
||||
я | i |
||||
с | from |
||||
со | alternative form |
||||
как | how |
||||
а | milder form of `no' (but) |
||||
то | conjunction and form of `that' |
||||
все | all |
||||
она | she |
||||
так | so, thus |
||||
его | him |
||||
но | but |
||||
да | yes/and |
||||
ты | thou |
||||
к | towards, by |
||||
у | around, chez |
||||
же | intensifier particle |
||||
вы | you |
||||
за | beyond, behind |
||||
бы | conditional/subj. particle |
||||
по | up to, along |
||||
только | only |
||||
ее | her |
||||
мне | to me |
||||
было | it was |
||||
вот | here is/are, particle |
||||
от | away from |
||||
меня | me |
||||
еще | still, yet, more |
||||
нет | no, there isn't/aren't |
||||
о | about |
||||
из | out of |
||||
ему | to him |
||||
теперь | now |
||||
когда | when |
||||
даже | even |
||||
ну | so, well |
||||
вдруг | suddenly |
||||
ли | interrogative particle |
||||
если | if |
||||
уже | already, but homonym of `narrower' |
||||
или | or |
||||
ни | neither |
||||
быть | to be |
||||
был | he was |
||||
него | prepositional form of его |
||||
до | up to |
||||
вас | you accusative |
||||
нибудь | indef. suffix preceded by hyphen |
||||
опять | again |
||||
уж | already, but homonym of `adder' |
||||
вам | to you |
||||
сказал | he said |
||||
ведь | particle `after all' |
||||
там | there |
||||
потом | then |
||||
себя | oneself |
||||
ничего | nothing |
||||
ей | to her |
||||
может | usually with `быть' as `maybe' |
||||
они | they |
||||
тут | here |
||||
где | where |
||||
есть | there is/are |
||||
надо | got to, must |
||||
ней | prepositional form of ей |
||||
для | for |
||||
мы | we |
||||
тебя | thee |
||||
их | them, their |
||||
чем | than |
||||
была | she was |
||||
сам | self |
||||
чтоб | in order to |
||||
без | without |
||||
будто | as if |
||||
человек | man, person, one |
||||
чего | genitive form of `what' |
||||
раз | once |
||||
тоже | also |
||||
себе | to oneself |
||||
под | beneath |
||||
жизнь | life |
||||
будет | will be |
||||
ж | short form of intensifier particle `же' |
||||
тогда | then |
||||
кто | who |
||||
этот | this |
||||
говорил | was saying |
||||
того | genitive form of `that' |
||||
потому | for that reason |
||||
этого | genitive form of `this' |
||||
какой | which |
||||
совсем | altogether |
||||
ним | prepositional form of `его', `они' |
||||
здесь | here |
||||
этом | prepositional form of `этот' |
||||
один | one |
||||
почти | almost |
||||
мой | my |
||||
тем | instrumental/dative plural of `тот', `то' |
||||
чтобы | full form of `in order that' |
||||
нее | her (acc.) |
||||
кажется | it seems |
||||
сейчас | now |
||||
были | they were |
||||
куда | where to |
||||
зачем | why |
||||
сказать | to say |
||||
всех | all (acc., gen. preposn. plural) |
||||
никогда | never |
||||
сегодня | today |
||||
можно | possible, one can |
||||
при | by |
||||
наконец | finally |
||||
два | two |
||||
об | alternative form of `о', about |
||||
другой | another |
||||
хоть | even |
||||
после | after |
||||
над | above |
||||
больше | more |
||||
тот | that one (masc.) |
||||
через | across, in |
||||
эти | these |
||||
нас | us |
||||
про | about |
||||
всего | in all, only, of all |
||||
них | prepositional form of `они' (they) |
||||
какая | which, feminine |
||||
много | lots |
||||
разве | interrogative particle |
||||
сказала | she said |
||||
три | three |
||||
эту | this, acc. fem. sing. |
||||
моя | my, feminine |
||||
впрочем | moreover, besides |
||||
хорошо | good |
||||
свою | one's own, acc. fem. sing. |
||||
этой | oblique form of `эта', fem. `this' |
||||
перед | in front of |
||||
иногда | sometimes |
||||
лучше | better |
||||
чуть | a little |
||||
том | preposn. form of `that one' |
||||
нельзя | one must not |
||||
такой | such a one |
||||
им | to them |
||||
более | more |
||||
всегда | always |
||||
конечно | of course |
||||
всю | acc. fem. sing of `all' |
||||
между | between |
||||
|
||||
|
||||
| b: some paradigms |
||||
| |
||||
| personal pronouns |
||||
| |
||||
| я меня мне мной [мною] |
||||
| ты тебя тебе тобой [тобою] |
||||
| он его ему им [него, нему, ним] |
||||
| она ее ей ею [нее, ней, нею] |
||||
| оно его ему им [него, нему, ним] |
||||
| |
||||
| мы нас нам нами |
||||
| вы вас вам вами |
||||
| они их им ими [них, ним, ними] |
||||
| |
||||
| себя себе собой [собою] |
||||
| |
||||
| demonstrative pronouns: этот (this), тот (that) |
||||
| |
||||
| этот эта это эти |
||||
| этого эту это эти |
||||
| этого этой этого этих |
||||
| этому этой этому этим |
||||
| этим этой этим [этою] этими |
||||
| этом этой этом этих |
||||
| |
||||
| тот та то те |
||||
| того ту то те |
||||
| того той того тех |
||||
| тому той тому тем |
||||
| тем той тем [тою] теми |
||||
| том той том тех |
||||
| |
||||
| determinative pronouns |
||||
| |
||||
| (a) весь (all) |
||||
| |
||||
| весь вся все все |
||||
| всего всю все все |
||||
| всего всей всего всех |
||||
| всему всей всему всем |
||||
| всем всей всем [всею] всеми |
||||
| всем всей всем всех |
||||
| |
||||
| (b) сам (himself etc) |
||||
| |
||||
| сам сама само сами |
||||
| самого саму само самих |
||||
| самого самой самого самих |
||||
| самому самой самому самим |
||||
| самим самой самим [самою] самими |
||||
| самом самой самом самих |
||||
| |
||||
| stems of verbs `to be', `to have', `to do' and modal |
||||
| |
||||
| быть бы буд быв есть суть |
||||
| име |
||||
| дел |
||||
| мог мож мочь |
||||
| уме |
||||
| хоч хот |
||||
| долж |
||||
| можн |
||||
| нужн |
||||
| нельзя |
||||
|
@ -0,0 +1,354 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A Spanish stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
|
||||
| The following is a ranked list (commonest to rarest) of stopwords |
||||
| deriving from a large sample of text. |
||||
|
||||
| Extra words have been added at the end. |
||||
|
||||
de | from, of |
||||
la | the, her |
||||
que | who, that |
||||
el | the |
||||
en | in |
||||
y | and |
||||
a | to |
||||
los | the, them |
||||
del | de + el |
||||
se | himself, from him etc |
||||
las | the, them |
||||
por | for, by, etc |
||||
un | a |
||||
para | for |
||||
con | with |
||||
no | no |
||||
una | a |
||||
su | his, her |
||||
al | a + el |
||||
| es from SER |
||||
lo | him |
||||
como | how |
||||
más | more |
||||
pero | but |
||||
sus | su plural |
||||
le | to him, her |
||||
ya | already |
||||
o | or |
||||
| fue from SER |
||||
este | this |
||||
| ha from HABER |
||||
sí | himself etc |
||||
porque | because |
||||
esta | this |
||||
| son from SER |
||||
entre | between |
||||
| está from ESTAR |
||||
cuando | when |
||||
muy | very |
||||
sin | without |
||||
sobre | on |
||||
| ser from SER |
||||
| tiene from TENER |
||||
también | also |
||||
me | me |
||||
hasta | until |
||||
hay | there is/are |
||||
donde | where |
||||
| han from HABER |
||||
quien | whom, that |
||||
| están from ESTAR |
||||
| estado from ESTAR |
||||
desde | from |
||||
todo | all |
||||
nos | us |
||||
durante | during |
||||
| estados from ESTAR |
||||
todos | all |
||||
uno | a |
||||
les | to them |
||||
ni | nor |
||||
contra | against |
||||
otros | other |
||||
| fueron from SER |
||||
ese | that |
||||
eso | that |
||||
| había from HABER |
||||
ante | before |
||||
ellos | they |
||||
e | and (variant of y) |
||||
esto | this |
||||
mí | me |
||||
antes | before |
||||
algunos | some |
||||
qué | what? |
||||
unos | a |
||||
yo | I |
||||
otro | other |
||||
otras | other |
||||
otra | other |
||||
él | he |
||||
tanto | so much, many |
||||
esa | that |
||||
estos | these |
||||
mucho | much, many |
||||
quienes | who |
||||
nada | nothing |
||||
muchos | many |
||||
cual | who |
||||
| sea from SER |
||||
poco | few |
||||
ella | she |
||||
estar | to be |
||||
| haber from HABER |
||||
estas | these |
||||
| estaba from ESTAR |
||||
| estamos from ESTAR |
||||
algunas | some |
||||
algo | something |
||||
nosotros | we |
||||
|
||||
| other forms |
||||
|
||||
mi | me |
||||
mis | mi plural |
||||
tú | thou |
||||
te | thee |
||||
ti | thee |
||||
tu | thy |
||||
tus | tu plural |
||||
ellas | they |
||||
nosotras | we |
||||
vosotros | you |
||||
vosotras | you |
||||
os | you |
||||
mío | mine |
||||
mía | |
||||
míos | |
||||
mías | |
||||
tuyo | thine |
||||
tuya | |
||||
tuyos | |
||||
tuyas | |
||||
suyo | his, hers, theirs |
||||
suya | |
||||
suyos | |
||||
suyas | |
||||
nuestro | ours |
||||
nuestra | |
||||
nuestros | |
||||
nuestras | |
||||
vuestro | yours |
||||
vuestra | |
||||
vuestros | |
||||
vuestras | |
||||
esos | those |
||||
esas | those |
||||
|
||||
| forms of estar, to be (not including the infinitive): |
||||
estoy |
||||
estás |
||||
está |
||||
estamos |
||||
estáis |
||||
están |
||||
esté |
||||
estés |
||||
estemos |
||||
estéis |
||||
estén |
||||
estaré |
||||
estarás |
||||
estará |
||||
estaremos |
||||
estaréis |
||||
estarán |
||||
estaría |
||||
estarías |
||||
estaríamos |
||||
estaríais |
||||
estarían |
||||
estaba |
||||
estabas |
||||
estábamos |
||||
estabais |
||||
estaban |
||||
estuve |
||||
estuviste |
||||
estuvo |
||||
estuvimos |
||||
estuvisteis |
||||
estuvieron |
||||
estuviera |
||||
estuvieras |
||||
estuviéramos |
||||
estuvierais |
||||
estuvieran |
||||
estuviese |
||||
estuvieses |
||||
estuviésemos |
||||
estuvieseis |
||||
estuviesen |
||||
estando |
||||
estado |
||||
estada |
||||
estados |
||||
estadas |
||||
estad |
||||
|
||||
| forms of haber, to have (not including the infinitive): |
||||
he |
||||
has |
||||
ha |
||||
hemos |
||||
habéis |
||||
han |
||||
haya |
||||
hayas |
||||
hayamos |
||||
hayáis |
||||
hayan |
||||
habré |
||||
habrás |
||||
habrá |
||||
habremos |
||||
habréis |
||||
habrán |
||||
habría |
||||
habrías |
||||
habríamos |
||||
habríais |
||||
habrían |
||||
había |
||||
habías |
||||
habíamos |
||||
habíais |
||||
habían |
||||
hube |
||||
hubiste |
||||
hubo |
||||
hubimos |
||||
hubisteis |
||||
hubieron |
||||
hubiera |
||||
hubieras |
||||
hubiéramos |
||||
hubierais |
||||
hubieran |
||||
hubiese |
||||
hubieses |
||||
hubiésemos |
||||
hubieseis |
||||
hubiesen |
||||
habiendo |
||||
habido |
||||
habida |
||||
habidos |
||||
habidas |
||||
|
||||
| forms of ser, to be (not including the infinitive): |
||||
soy |
||||
eres |
||||
es |
||||
somos |
||||
sois |
||||
son |
||||
sea |
||||
seas |
||||
seamos |
||||
seáis |
||||
sean |
||||
seré |
||||
serás |
||||
será |
||||
seremos |
||||
seréis |
||||
serán |
||||
sería |
||||
serías |
||||
seríamos |
||||
seríais |
||||
serían |
||||
era |
||||
eras |
||||
éramos |
||||
erais |
||||
eran |
||||
fui |
||||
fuiste |
||||
fue |
||||
fuimos |
||||
fuisteis |
||||
fueron |
||||
fuera |
||||
fueras |
||||
fuéramos |
||||
fuerais |
||||
fueran |
||||
fuese |
||||
fueses |
||||
fuésemos |
||||
fueseis |
||||
fuesen |
||||
siendo |
||||
sido |
||||
| sed also means 'thirst' |
||||
|
||||
| forms of tener, to have (not including the infinitive): |
||||
tengo |
||||
tienes |
||||
tiene |
||||
tenemos |
||||
tenéis |
||||
tienen |
||||
tenga |
||||
tengas |
||||
tengamos |
||||
tengáis |
||||
tengan |
||||
tendré |
||||
tendrás |
||||
tendrá |
||||
tendremos |
||||
tendréis |
||||
tendrán |
||||
tendría |
||||
tendrías |
||||
tendríamos |
||||
tendríais |
||||
tendrían |
||||
tenía |
||||
tenías |
||||
teníamos |
||||
teníais |
||||
tenían |
||||
tuve |
||||
tuviste |
||||
tuvo |
||||
tuvimos |
||||
tuvisteis |
||||
tuvieron |
||||
tuviera |
||||
tuvieras |
||||
tuviéramos |
||||
tuvierais |
||||
tuvieran |
||||
tuviese |
||||
tuvieses |
||||
tuviésemos |
||||
tuvieseis |
||||
tuviesen |
||||
teniendo |
||||
tenido |
||||
tenida |
||||
tenidos |
||||
tenidas |
||||
tened |
||||
|
@ -0,0 +1,131 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt |
||||
| This file is distributed under the BSD License. |
||||
| See http://snowball.tartarus.org/license.php |
||||
| Also see http://www.opensource.org/licenses/bsd-license.html |
||||
| - Encoding was converted to UTF-8. |
||||
| - This notice was added. |
||||
|
||||
| A Swedish stop word list. Comments begin with vertical bar. Each stop |
||||
| word is at the start of a line. |
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from |
||||
| a large text sample. |
||||
|
||||
| Swedish stop words occasionally exhibit homonym clashes. For example |
||||
| så = so, but also seed. These are indicated clearly below. |
||||
|
||||
och | and |
||||
det | it, this/that |
||||
att | to (with infinitive) |
||||
i | in, at |
||||
en | a |
||||
jag | I |
||||
hon | she |
||||
som | who, that |
||||
han | he |
||||
på | on |
||||
den | it, this/that |
||||
med | with |
||||
var | where, each |
||||
sig | him(self) etc |
||||
för | for |
||||
så | so (also: seed) |
||||
till | to |
||||
är | is |
||||
men | but |
||||
ett | a |
||||
om | if; around, about |
||||
hade | had |
||||
de | they, these/those |
||||
av | of |
||||
icke | not, no |
||||
mig | me |
||||
du | you |
||||
henne | her |
||||
då | then, when |
||||
sin | his |
||||
nu | now |
||||
har | have |
||||
inte | inte någon = no one |
||||
hans | his |
||||
honom | him |
||||
skulle | should, would |
||||
hennes | her |
||||
där | there |
||||
min | my |
||||
man | one (pronoun) |
||||
ej | not |
||||
vid | at, by, on (also: vast) |
||||
kunde | could |
||||
något | some etc |
||||
från | from, off |
||||
ut | out |
||||
när | when |
||||
efter | after, behind |
||||
upp | up |
||||
vi | we |
||||
dem | them |
||||
vara | be |
||||
vad | what |
||||
över | over |
||||
än | than |
||||
dig | you |
||||
kan | can |
||||
sina | his |
||||
här | here |
||||
ha | have |
||||
mot | towards |
||||
alla | all |
||||
under | under (also: wonder) |
||||
någon | some etc |
||||
eller | or (else) |
||||
allt | all |
||||
mycket | much |
||||
sedan | since |
||||
ju | after all (particle) |
||||
denna | this/that |
||||
själv | myself, yourself etc |
||||
detta | this/that |
||||
åt | to |
||||
utan | without |
||||
varit | was |
||||
hur | how |
||||
ingen | no |
||||
mitt | my |
||||
ni | you |
||||
bli | to be, become |
||||
blev | from bli |
||||
oss | us |
||||
din | thy |
||||
dessa | these/those |
||||
några | some etc |
||||
deras | their |
||||
blir | from bli |
||||
mina | my |
||||
samma | (the) same |
||||
vilken | who, that |
||||
er | you, your |
||||
sådan | such a |
||||
vår | our |
||||
blivit | from bli |
||||
dess | its |
||||
inom | within |
||||
mellan | between |
||||
sådant | such a |
||||
varför | why |
||||
varje | each |
||||
vilka | who, that |
||||
ditt | thy |
||||
vem | who |
||||
vilket | who, that |
||||
sitta | his |
||||
sådana | such a |
||||
vart | each |
||||
dina | thy |
||||
vars | whose |
||||
vårt | our |
||||
våra | our |
||||
ert | your |
||||
era | your |
||||
vilkas | whose |
||||
|
@ -0,0 +1,212 @@
|
||||
# Turkish stopwords from LUCENE-559 |
||||
# merged with the list from "Information Retrieval on Turkish Texts" |
||||
# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) |
||||
acaba |
||||
altmış |
||||
altı |
||||
ama |
||||
ancak |
||||
arada |
||||
aslında |
||||
ayrıca |
||||
bana |
||||
bazı |
||||
belki |
||||
ben |
||||
benden |
||||
beni |
||||
benim |
||||
beri |
||||
beş |
||||
bile |
||||
bin |
||||
bir |
||||
birçok |
||||
biri |
||||
birkaç |
||||
birkez |
||||
birşey |
||||
birşeyi |
||||
biz |
||||
bize |
||||
bizden |
||||
bizi |
||||
bizim |
||||
böyle |
||||
böylece |
||||
bu |
||||
buna |
||||
bunda |
||||
bundan |
||||
bunlar |
||||
bunları |
||||
bunların |
||||
bunu |
||||
bunun |
||||
burada |
||||
çok |
||||
çünkü |
||||
da |
||||
daha |
||||
dahi |
||||
de |
||||
defa |
||||
değil |
||||
diğer |
||||
diye |
||||
doksan |
||||
dokuz |
||||
dolayı |
||||
dolayısıyla |
||||
dört |
||||
edecek |
||||
eden |
||||
ederek |
||||
edilecek |
||||
ediliyor |
||||
edilmesi |
||||
ediyor |
||||
eğer |
||||
elli |
||||
en |
||||
etmesi |
||||
etti |
||||
ettiği |
||||
ettiğini |
||||
gibi |
||||
göre |
||||
halen |
||||
hangi |
||||
hatta |
||||
hem |
||||
henüz |
||||
hep |
||||
hepsi |
||||
her |
||||
herhangi |
||||
herkesin |
||||
hiç |
||||
hiçbir |
||||
için |
||||
iki |
||||
ile |
||||
ilgili |
||||
ise |
||||
işte |
||||
itibaren |
||||
itibariyle |
||||
kadar |
||||
karşın |
||||
katrilyon |
||||
kendi |
||||
kendilerine |
||||
kendini |
||||
kendisi |
||||
kendisine |
||||
kendisini |
||||
kez |
||||
ki |
||||
kim |
||||
kimden |
||||
kime |
||||
kimi |
||||
kimse |
||||
kırk |
||||
milyar |
||||
milyon |
||||
mu |
||||
mü |
||||
mı |
||||
nasıl |
||||
ne |
||||
neden |
||||
nedenle |
||||
nerde |
||||
nerede |
||||
nereye |
||||
niye |
||||
niçin |
||||
o |
||||
olan |
||||
olarak |
||||
oldu |
||||
olduğu |
||||
olduğunu |
||||
olduklarını |
||||
olmadı |
||||
olmadığı |
||||
olmak |
||||
olması |
||||
olmayan |
||||
olmaz |
||||
olsa |
||||
olsun |
||||
olup |
||||
olur |
||||
olursa |
||||
oluyor |
||||
on |
||||
ona |
||||
ondan |
||||
onlar |
||||
onlardan |
||||
onları |
||||
onların |
||||
onu |
||||
onun |
||||
otuz |
||||
oysa |
||||
öyle |
||||
pek |
||||
rağmen |
||||
sadece |
||||
sanki |
||||
sekiz |
||||
seksen |
||||
sen |
||||
senden |
||||
seni |
||||
senin |
||||
siz |
||||
sizden |
||||
sizi |
||||
sizin |
||||
şey |
||||
şeyden |
||||
şeyi |
||||
şeyler |
||||
şöyle |
||||
şu |
||||
şuna |
||||
şunda |
||||
şundan |
||||
şunları |
||||
şunu |
||||
tarafından |
||||
trilyon |
||||
tüm |
||||
üç |
||||
üzere |
||||
var |
||||
vardı |
||||
ve |
||||
veya |
||||
ya |
||||
yani |
||||
yapacak |
||||
yapılan |
||||
yapılması |
||||
yapıyor |
||||
yapmak |
||||
yaptı |
||||
yaptığı |
||||
yaptığını |
||||
yaptıkları |
||||
yedi |
||||
yerine |
||||
yetmiş |
||||
yine |
||||
yirmi |
||||
yoksa |
||||
yüz |
||||
zaten |
@ -0,0 +1,29 @@
|
||||
package com.fr.third.org.apache.lucene; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
/** Lucene's package information, including version. */ |
||||
public final class LucenePackage { |
||||
|
||||
private LucenePackage() {} // can't construct
|
||||
|
||||
/** Return Lucene's package, including version information. */ |
||||
public static Package get() { |
||||
return LucenePackage.class.getPackage(); |
||||
} |
||||
} |
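
A one-line illustrative use of this class: getImplementationVersion() reads the enclosing jar's manifest, so the result is null when no manifest version is present. The class name here is hypothetical.

    // Minimal sketch: print the repackaged Lucene version, if the manifest carries one.
    public class VersionProbe {
        public static void main(String[] args) {
            Package pkg = com.fr.third.org.apache.lucene.LucenePackage.get();
            System.out.println(pkg.getName() + " version: " + pkg.getImplementationVersion());
        }
    }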
@ -0,0 +1,393 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.store.AlreadyClosedException; |
||||
import com.fr.third.org.apache.lucene.util.CloseableThreadLocal; |
||||
|
||||
import java.io.Closeable; |
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
import java.util.HashMap; |
||||
import java.util.Map; |
||||
|
||||
/** |
||||
* An Analyzer builds TokenStreams, which analyze text. It thus represents a |
||||
* policy for extracting index terms from text. |
||||
* <p> |
||||
* In order to define what analysis is done, subclasses must define their |
||||
* {@link TokenStreamComponents TokenStreamComponents} in {@link #createComponents(String, Reader)}. |
||||
* The components are then reused in each call to {@link #tokenStream(String, Reader)}. |
||||
* <p> |
||||
* Simple example: |
||||
* <pre class="prettyprint"> |
||||
* Analyzer analyzer = new Analyzer() { |
||||
* {@literal @Override} |
||||
* protected TokenStreamComponents createComponents(String fieldName, Reader reader) { |
||||
* Tokenizer source = new FooTokenizer(reader); |
||||
* TokenStream filter = new FooFilter(source); |
||||
* filter = new BarFilter(filter); |
||||
* return new TokenStreamComponents(source, filter); |
||||
* } |
||||
* }; |
||||
* </pre> |
||||
* For more examples, see the {@link com.fr.third.org.apache.lucene.analysis Analysis package documentation}. |
||||
* <p> |
||||
* For some concrete implementations bundled with Lucene, look in the analysis modules: |
||||
* <ul> |
||||
* <li><a href="{@docRoot}/../analyzers-common/overview-summary.html">Common</a>: |
||||
* Analyzers for indexing content in different languages and domains. |
||||
* <li><a href="{@docRoot}/../analyzers-icu/overview-summary.html">ICU</a>: |
||||
* Exposes functionality from ICU to Apache Lucene. |
||||
* <li><a href="{@docRoot}/../analyzers-kuromoji/overview-summary.html">Kuromoji</a>: |
||||
* Morphological analyzer for Japanese text. |
||||
* <li><a href="{@docRoot}/../analyzers-morfologik/overview-summary.html">Morfologik</a>: |
||||
* Dictionary-driven lemmatization for the Polish language. |
||||
* <li><a href="{@docRoot}/../analyzers-phonetic/overview-summary.html">Phonetic</a>: |
||||
* Analysis for indexing phonetic signatures (for sounds-alike search). |
||||
* <li><a href="{@docRoot}/../analyzers-smartcn/overview-summary.html">Smart Chinese</a>: |
||||
* Analyzer for Simplified Chinese, which indexes words. |
||||
* <li><a href="{@docRoot}/../analyzers-stempel/overview-summary.html">Stempel</a>: |
||||
* Algorithmic Stemmer for the Polish Language. |
||||
* <li><a href="{@docRoot}/../analyzers-uima/overview-summary.html">UIMA</a>: |
||||
* Analysis integration with Apache UIMA. |
||||
* </ul> |
||||
*/ |
||||
public abstract class Analyzer implements Closeable { |
||||
|
||||
private final ReuseStrategy reuseStrategy; |
||||
|
||||
/** |
||||
* Create a new Analyzer, reusing the same set of components per-thread |
||||
* across calls to {@link #tokenStream(String, Reader)}. |
||||
*/ |
||||
public Analyzer() { |
||||
this(new GlobalReuseStrategy()); |
||||
} |
||||
|
||||
/** |
||||
* Expert: create a new Analyzer with a custom {@link ReuseStrategy}. |
||||
* <p> |
||||
* NOTE: if you just want to reuse on a per-field basis, its easier to |
||||
* use a subclass of {@link AnalyzerWrapper} such as |
||||
* <a href="{@docRoot}/../analyzers-common/org/apache/lucene/analysis/miscellaneous/PerFieldAnalyzerWrapper.html"> |
||||
* PerFieldAnalyzerWrapper</a> instead. |
||||
*/ |
||||
public Analyzer(ReuseStrategy reuseStrategy) { |
||||
this.reuseStrategy = reuseStrategy; |
||||
} |
||||
|
||||
/** |
||||
* Creates a new {@link TokenStreamComponents} instance for this analyzer. |
||||
* |
||||
* @param fieldName |
||||
* the name of the field's content passed to the |
||||
* {@link TokenStreamComponents} sink as a reader |
||||
* @param reader |
||||
* the reader passed to the {@link Tokenizer} constructor |
||||
* @return the {@link TokenStreamComponents} for this analyzer. |
||||
*/ |
||||
protected abstract TokenStreamComponents createComponents(String fieldName, |
||||
Reader reader); |
||||
|
||||
/** |
||||
* Returns a TokenStream suitable for <code>fieldName</code>, tokenizing |
||||
* the contents of <code>reader</code>. |
||||
* <p> |
||||
* This method uses {@link #createComponents(String, Reader)} to obtain an |
||||
* instance of {@link TokenStreamComponents}. It returns the sink of the |
||||
* components and stores the components internally. Subsequent calls to this |
||||
* method will reuse the previously stored components after resetting them |
||||
* through {@link TokenStreamComponents#setReader(Reader)}. |
||||
* <p> |
||||
* <b>NOTE:</b> After calling this method, the consumer must follow the |
||||
* workflow described in {@link TokenStream} to properly consume its contents. |
||||
* See the {@link com.fr.third.org.apache.lucene.analysis Analysis package documentation} for |
||||
* some examples demonstrating this. |
||||
* |
||||
* @param fieldName the name of the field the created TokenStream is used for |
||||
* @param reader the reader the stream's source reads from |
||||
* @return TokenStream for iterating the analyzed content of <code>reader</code> |
||||
* @throws AlreadyClosedException if the Analyzer is closed. |
||||
* @throws IOException if an i/o error occurs. |
||||
*/ |
||||
public final TokenStream tokenStream(final String fieldName, |
||||
final Reader reader) throws IOException { |
||||
TokenStreamComponents components = reuseStrategy.getReusableComponents(fieldName); |
||||
final Reader r = initReader(fieldName, reader); |
||||
if (components == null) { |
||||
components = createComponents(fieldName, r); |
||||
reuseStrategy.setReusableComponents(fieldName, components); |
||||
} else { |
||||
components.setReader(r); |
||||
} |
||||
return components.getTokenStream(); |
||||
} |
||||
|
||||
/** |
||||
* Override this if you want to add a CharFilter chain. |
||||
* <p> |
||||
* The default implementation returns <code>reader</code> |
||||
* unchanged. |
||||
* |
||||
* @param fieldName IndexableField name being indexed |
||||
* @param reader original Reader |
||||
* @return reader, optionally decorated with CharFilter(s) |
||||
*/ |
||||
protected Reader initReader(String fieldName, Reader reader) { |
||||
return reader; |
||||
} |
||||
|
||||
/** |
||||
* Invoked before indexing a IndexableField instance if |
||||
* terms have already been added to that field. This allows custom |
||||
* analyzers to place an automatic position increment gap between |
||||
* IndexbleField instances using the same field name. The default value |
||||
* position increment gap is 0. With a 0 position increment gap and |
||||
* the typical default token position increment of 1, all terms in a field, |
||||
* including across IndexableField instances, are in successive positions, allowing |
||||
* exact PhraseQuery matches, for instance, across IndexableField instance boundaries. |
||||
* |
||||
* @param fieldName IndexableField name being indexed. |
||||
* @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}. |
||||
* This value must be {@code >= 0}. |
||||
*/ |
||||
public int getPositionIncrementGap(String fieldName) { |
||||
return 0; |
||||
} |
||||
|
||||
/** |
||||
* Just like {@link #getPositionIncrementGap}, except for |
||||
* Token offsets instead. By default this returns 1. |
||||
* This method is only called if the field |
||||
* produced at least one token for indexing. |
||||
* |
||||
* @param fieldName the field just indexed |
||||
* @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}. |
||||
* This value must be {@code >= 0}. |
||||
*/ |
||||
public int getOffsetGap(String fieldName) { |
||||
return 1; |
||||
} |
||||
|
||||
/** Frees persistent resources used by this Analyzer */ |
||||
public void close() { |
||||
reuseStrategy.close(); |
||||
} |
||||
|
||||
/** |
||||
* This class encapsulates the outer components of a token stream. It provides |
||||
* access to the source ({@link Tokenizer}) and the outer end (sink), an |
||||
* instance of {@link TokenFilter} which also serves as the |
||||
* {@link TokenStream} returned by |
||||
* {@link Analyzer#tokenStream(String, Reader)}. |
||||
*/ |
||||
public static class TokenStreamComponents { |
||||
/** |
||||
* Original source of the tokens. |
||||
*/ |
||||
protected final Tokenizer source; |
||||
/** |
||||
* Sink tokenstream, such as the outer tokenfilter decorating |
||||
* the chain. This can be the source if there are no filters. |
||||
*/ |
||||
protected final TokenStream sink; |
||||
|
||||
/** |
||||
* Creates a new {@link TokenStreamComponents} instance. |
||||
* |
||||
* @param source |
||||
* the analyzer's tokenizer |
||||
* @param result |
||||
* the analyzer's resulting token stream |
||||
*/ |
||||
public TokenStreamComponents(final Tokenizer source, |
||||
final TokenStream result) { |
||||
this.source = source; |
||||
this.sink = result; |
||||
} |
||||
|
||||
/** |
||||
* Creates a new {@link TokenStreamComponents} instance. |
||||
* |
||||
* @param source |
||||
* the analyzer's tokenizer |
||||
*/ |
||||
public TokenStreamComponents(final Tokenizer source) { |
||||
this.source = source; |
||||
this.sink = source; |
||||
} |
||||
|
||||
/** |
||||
* Resets the encapsulated components with the given reader. If the components |
||||
* cannot be reset, an Exception should be thrown. |
||||
* |
||||
* @param reader |
||||
* a reader to reset the source component |
||||
* @throws IOException |
||||
* if the component's reset method throws an {@link IOException} |
||||
*/ |
||||
protected void setReader(final Reader reader) throws IOException { |
||||
source.setReader(reader); |
||||
} |
||||
|
||||
/** |
||||
* Returns the sink {@link TokenStream} |
||||
* |
||||
* @return the sink {@link TokenStream} |
||||
*/ |
||||
public TokenStream getTokenStream() { |
||||
return sink; |
||||
} |
||||
|
||||
/** |
||||
* Returns the component's {@link Tokenizer} |
||||
* |
||||
* @return Component's {@link Tokenizer} |
||||
*/ |
||||
public Tokenizer getTokenizer() { |
||||
return source; |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Strategy defining how TokenStreamComponents are reused per call to |
||||
* {@link Analyzer#tokenStream(String, Reader)}. |
||||
*/ |
||||
public static abstract class ReuseStrategy implements Closeable { |
||||
|
||||
private CloseableThreadLocal<Object> storedValue = new CloseableThreadLocal<Object>(); |
||||
|
||||
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ |
||||
public ReuseStrategy() {} |
||||
|
||||
/** |
||||
* Gets the reusable TokenStreamComponents for the field with the given name |
||||
* |
||||
* @param fieldName Name of the field whose reusable TokenStreamComponents |
||||
* are to be retrieved |
||||
* @return Reusable TokenStreamComponents for the field, or {@code null} |
||||
* if there was no previous components for the field |
||||
*/ |
||||
public abstract TokenStreamComponents getReusableComponents(String fieldName); |
||||
|
||||
/** |
||||
* Stores the given TokenStreamComponents as the reusable components for the |
||||
* field with the given name |
||||
* |
||||
* @param fieldName Name of the field whose TokenStreamComponents are being set |
||||
* @param components TokenStreamComponents which are to be reused for the field |
||||
*/ |
||||
public abstract void setReusableComponents(String fieldName, TokenStreamComponents components); |
||||
|
||||
/** |
||||
* Returns the currently stored value |
||||
* |
||||
* @return Currently stored value or {@code null} if no value is stored |
||||
* @throws AlreadyClosedException if the ReuseStrategy is closed. |
||||
*/ |
||||
protected final Object getStoredValue() { |
||||
try { |
||||
return storedValue.get(); |
||||
} catch (NullPointerException npe) { |
||||
if (storedValue == null) { |
||||
throw new AlreadyClosedException("this Analyzer is closed"); |
||||
} else { |
||||
throw npe; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Sets the stored value |
||||
* |
||||
* @param storedValue Value to store |
||||
* @throws AlreadyClosedException if the ReuseStrategy is closed. |
||||
*/ |
||||
protected final void setStoredValue(Object storedValue) { |
||||
try { |
||||
this.storedValue.set(storedValue); |
||||
} catch (NullPointerException npe) { |
||||
if (storedValue == null) { |
||||
throw new AlreadyClosedException("this Analyzer is closed"); |
||||
} else { |
||||
throw npe; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Closes the ReuseStrategy, freeing any resources |
||||
*/ |
||||
public void close() { |
||||
if (storedValue != null) { |
||||
storedValue.close(); |
||||
storedValue = null; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Implementation of {@link ReuseStrategy} that reuses the same components for |
||||
* every field. |
||||
*/ |
||||
public final static class GlobalReuseStrategy extends ReuseStrategy { |
||||
|
||||
/** Creates a new instance, with empty per-thread values */ |
||||
public GlobalReuseStrategy() {} |
||||
|
||||
@Override |
||||
public TokenStreamComponents getReusableComponents(String fieldName) { |
||||
return (TokenStreamComponents) getStoredValue(); |
||||
} |
||||
|
||||
@Override |
||||
public void setReusableComponents(String fieldName, TokenStreamComponents components) { |
||||
setStoredValue(components); |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Implementation of {@link ReuseStrategy} that reuses components per-field by |
||||
* maintaining a Map of TokenStreamComponents per field name. |
||||
*/ |
||||
public static class PerFieldReuseStrategy extends ReuseStrategy { |
||||
|
||||
/** Creates a new instance, with empty per-thread-per-field values */ |
||||
public PerFieldReuseStrategy() {} |
||||
|
||||
@SuppressWarnings("unchecked") |
||||
@Override |
||||
public TokenStreamComponents getReusableComponents(String fieldName) { |
||||
Map<String, TokenStreamComponents> componentsPerField = (Map<String, TokenStreamComponents>) getStoredValue(); |
||||
return componentsPerField != null ? componentsPerField.get(fieldName) : null; |
||||
} |
||||
|
||||
@SuppressWarnings("unchecked") |
||||
@Override |
||||
public void setReusableComponents(String fieldName, TokenStreamComponents components) { |
||||
Map<String, TokenStreamComponents> componentsPerField = (Map<String, TokenStreamComponents>) getStoredValue(); |
||||
if (componentsPerField == null) { |
||||
componentsPerField = new HashMap<String, TokenStreamComponents>(); |
||||
setStoredValue(componentsPerField); |
||||
} |
||||
componentsPerField.put(fieldName, components); |
||||
} |
||||
} |
||||
|
||||
} |
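
A concrete subclass following the createComponents() contract above could look like the sketch below. WhitespaceTokenizer and LowerCaseFilter come from the analyzers-common module (whose factories this change also vendors); the Version constant is an assumption for this repackaged fork.

    import java.io.Reader;

    import com.fr.third.org.apache.lucene.analysis.Analyzer;
    import com.fr.third.org.apache.lucene.analysis.TokenStream;
    import com.fr.third.org.apache.lucene.analysis.Tokenizer;
    import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter;
    import com.fr.third.org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import com.fr.third.org.apache.lucene.util.Version;

    // Minimal sketch: whitespace tokenization followed by lower-casing.
    public final class LowercaseWhitespaceAnalyzer extends Analyzer {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            // Source tokenizer and the outer filter (sink) form the reusable components.
            Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_40, reader);
            TokenStream sink = new LowerCaseFilter(Version.LUCENE_40, source);
            return new TokenStreamComponents(source, sink);
        }
    }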
@ -0,0 +1,83 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.Reader; |
||||
|
||||
/** |
||||
* Extension to {@link Analyzer} suitable for Analyzers which wrap |
||||
* other Analyzers. |
||||
* <p/> |
||||
* {@link #getWrappedAnalyzer(String)} allows the Analyzer |
||||
* to wrap multiple Analyzers which are selected on a per field basis. |
||||
* <p/> |
||||
* {@link #wrapComponents(String, TokenStreamComponents)} allows the |
||||
* TokenStreamComponents of the wrapped Analyzer to then be wrapped |
||||
* (such as adding a new {@link TokenFilter} to form new TokenStreamComponents). |
||||
*/ |
||||
public abstract class AnalyzerWrapper extends Analyzer { |
||||
|
||||
/** |
||||
* Creates a new AnalyzerWrapper. Since the {@link ReuseStrategy} of |
||||
* the wrapped Analyzers is unknown, {@link PerFieldReuseStrategy} is assumed. |
||||
*/ |
||||
protected AnalyzerWrapper() { |
||||
super(new PerFieldReuseStrategy()); |
||||
} |
||||
|
||||
/** |
||||
* Retrieves the wrapped Analyzer appropriate for analyzing the field with |
||||
* the given name |
||||
* |
||||
* @param fieldName Name of the field which is to be analyzed |
||||
* @return Analyzer for the field with the given name. Assumed to be non-null |
||||
*/ |
||||
protected abstract Analyzer getWrappedAnalyzer(String fieldName); |
||||
|
||||
/** |
||||
* Wraps / alters the given TokenStreamComponents, taken from the wrapped |
||||
* Analyzer, to form new components. It is through this method that new |
||||
* TokenFilters can be added by AnalyzerWrappers. |
||||
* |
||||
* |
||||
* @param fieldName Name of the field which is to be analyzed |
||||
* @param components TokenStreamComponents taken from the wrapped Analyzer |
||||
* @return Wrapped / altered TokenStreamComponents. |
||||
*/ |
||||
protected abstract TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components); |
||||
|
||||
@Override |
||||
protected final TokenStreamComponents createComponents(String fieldName, Reader aReader) { |
||||
return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName, aReader)); |
||||
} |
||||
|
||||
@Override |
||||
public final int getPositionIncrementGap(String fieldName) { |
||||
return getWrappedAnalyzer(fieldName).getPositionIncrementGap(fieldName); |
||||
} |
||||
|
||||
@Override |
||||
public final int getOffsetGap(String fieldName) { |
||||
return getWrappedAnalyzer(fieldName).getOffsetGap(fieldName); |
||||
} |
||||
|
||||
@Override |
||||
public final Reader initReader(String fieldName, Reader reader) { |
||||
return getWrappedAnalyzer(fieldName).initReader(fieldName, reader); |
||||
} |
||||
} |
@ -0,0 +1,98 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.util.Iterator; |
||||
import java.util.LinkedList; |
||||
import java.util.List; |
||||
|
||||
import com.fr.third.org.apache.lucene.util.AttributeSource; |
||||
|
||||
/** |
||||
* This class can be used if the token attributes of a TokenStream |
||||
* are intended to be consumed more than once. It caches |
||||
* all token attribute states locally in a List. |
||||
* |
||||
* <P>CachingTokenFilter implements the optional method |
||||
* {@link TokenStream#reset()}, which repositions the |
||||
* stream to the first Token. |
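* |
* <p>A two-pass usage sketch (the consumer objects are illustrative, |
* not part of this API): |
* <pre class="prettyprint"> |
* TokenStream ts = analyzer.tokenStream("field", reader); |
* ts.reset(); // reset the inner stream before wrapping it |
* CachingTokenFilter cached = new CachingTokenFilter(ts); |
* firstConsumer.consume(cached);  // first pass fills the cache |
* cached.reset();                 // rewind to the first cached token |
* secondConsumer.consume(cached); // second pass replays the cache |
* </pre> |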
||||
*/ |
||||
public final class CachingTokenFilter extends TokenFilter { |
||||
private List<State> cache = null; |
||||
private Iterator<State> iterator = null; |
||||
private State finalState; |
||||
|
||||
/** |
||||
* Create a new CachingTokenFilter around <code>input</code>, |
||||
* caching its token attributes, which can be replayed again |
||||
* after a call to {@link #reset()}. |
||||
*/ |
||||
public CachingTokenFilter(TokenStream input) { |
||||
super(input); |
||||
} |
||||
|
||||
@Override |
||||
public final boolean incrementToken() throws IOException { |
||||
if (cache == null) { |
||||
// fill cache lazily |
||||
cache = new LinkedList<State>(); |
||||
fillCache(); |
||||
iterator = cache.iterator(); |
||||
} |
||||
|
||||
if (!iterator.hasNext()) { |
||||
// the cache is exhausted, return false |
||||
return false; |
||||
} |
||||
// Since the TokenFilter can be reset, the tokens need to be preserved as immutable. |
||||
restoreState(iterator.next()); |
||||
return true; |
||||
} |
||||
|
||||
@Override |
||||
public final void end() { |
||||
if (finalState != null) { |
||||
restoreState(finalState); |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Rewinds the iterator to the beginning of the cached list. |
||||
* <p> |
||||
* Note that this never calls reset() on the wrapped tokenstream, not even |
||||
* the first time. You should reset() the inner tokenstream before wrapping |
||||
* it with CachingTokenFilter. |
||||
*/ |
||||
@Override |
||||
public void reset() { |
||||
if(cache != null) { |
||||
iterator = cache.iterator(); |
||||
} |
||||
} |
||||
|
||||
private void fillCache() throws IOException { |
||||
while(input.incrementToken()) { |
||||
cache.add(captureState()); |
||||
} |
||||
// capture final state |
||||
input.end(); |
||||
finalState = captureState(); |
||||
} |
||||
|
||||
} |
@ -0,0 +1,84 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
|
||||
/** |
||||
* Subclasses of CharFilter can be chained to filter a Reader. |
||||
* They can be used as {@link Reader} with additional offset |
||||
* correction. {@link Tokenizer}s will automatically use {@link #correctOffset} |
||||
* if a CharFilter subclass is used. |
||||
* <p> |
||||
* This class is abstract: at a minimum you must implement {@link #read(char[], int, int)}, |
||||
* transforming the input in some way from {@link #input}, and {@link #correct(int)} |
||||
* to adjust the offsets to match the originals. |
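* <p> |
* A minimal pass-through sketch (illustrative only; it alters nothing, so |
* offsets need no correction): |
* <pre class="prettyprint"> |
* class IdentityCharFilter extends CharFilter { |
*   IdentityCharFilter(Reader in) { super(in); } |
*   protected int correct(int currentOff) { |
*     return currentOff; // no characters inserted or removed |
*   } |
*   public int read(char[] cbuf, int off, int len) throws IOException { |
*     return input.read(cbuf, off, len); |
*   } |
* } |
* </pre> |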
||||
* <p> |
||||
* You can optionally provide more efficient implementations of additional methods |
||||
* like {@link #read()}, {@link #read(char[])}, {@link #read(java.nio.CharBuffer)}, |
||||
* but this is not required. |
||||
* <p> |
||||
* For examples and integration with {@link Analyzer}, see the |
||||
* {@link com.fr.third.org.apache.lucene.analysis Analysis package documentation}. |
||||
*/ |
||||
// the way java.io.FilterReader should work! |
||||
public abstract class CharFilter extends Reader { |
||||
/** |
||||
* The underlying character-input stream. |
||||
*/ |
||||
protected final Reader input; |
||||
|
||||
/** |
||||
* Create a new CharFilter wrapping the provided reader. |
||||
* @param input a Reader, can also be a CharFilter for chaining. |
||||
*/ |
||||
public CharFilter(Reader input) { |
||||
super(input); |
||||
this.input = input; |
||||
} |
||||
|
||||
/** |
||||
* Closes the underlying input stream. |
||||
* <p> |
||||
* <b>NOTE:</b> |
||||
* The default implementation closes the input Reader, so |
||||
* be sure to call <code>super.close()</code> when overriding this method. |
||||
*/ |
||||
@Override |
||||
public void close() throws IOException { |
||||
input.close(); |
||||
} |
||||
|
||||
/** |
||||
* Subclasses override to correct the current offset. |
||||
* |
||||
* @param currentOff current offset |
||||
* @return corrected offset |
||||
*/ |
||||
protected abstract int correct(int currentOff); |
||||
|
||||
/** |
||||
* Chains the corrected offset through the input |
||||
* CharFilter(s). |
||||
*/ |
||||
public final int correctOffset(int currentOff) { |
||||
final int corrected = correct(currentOff); |
||||
return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(corrected) : corrected; |
||||
} |
||||
} |
@ -0,0 +1,321 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
||||
import com.fr.third.org.apache.lucene.document.DoubleField; // for javadocs |
import com.fr.third.org.apache.lucene.document.FloatField; // for javadocs |
import com.fr.third.org.apache.lucene.document.IntField; // for javadocs |
import com.fr.third.org.apache.lucene.document.LongField; // for javadocs |
import com.fr.third.org.apache.lucene.search.NumericRangeFilter; // for javadocs |
||||
import com.fr.third.org.apache.lucene.search.NumericRangeQuery; |
||||
import com.fr.third.org.apache.lucene.util.Attribute; |
||||
import com.fr.third.org.apache.lucene.util.AttributeImpl; |
||||
import com.fr.third.org.apache.lucene.util.AttributeReflector; |
||||
import com.fr.third.org.apache.lucene.util.BytesRef; |
||||
import com.fr.third.org.apache.lucene.util.NumericUtils; |
||||
|
||||
/** |
||||
* <b>Expert:</b> This class provides a {@link TokenStream} |
||||
* for indexing numeric values that can be used by {@link |
||||
* NumericRangeQuery} or {@link NumericRangeFilter}. |
||||
* |
||||
* <p>Note that for simple usage, {@link IntField}, {@link |
||||
* LongField}, {@link FloatField} or {@link DoubleField} is |
||||
* recommended. These fields disable norms and |
||||
* term freqs, as they are not usually needed during |
||||
* searching. If you need to change these settings, you |
||||
* should use this class. |
||||
* |
||||
* <p>Here's an example usage, for an <code>int</code> field: |
||||
* |
||||
* <pre class="prettyprint"> |
||||
* FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED); |
||||
* fieldType.setOmitNorms(true); |
||||
* fieldType.setIndexOptions(IndexOptions.DOCS_ONLY); |
||||
* Field field = new Field(name, new NumericTokenStream(precisionStep).setIntValue(value), fieldType); |
||||
* document.add(field); |
||||
* </pre> |
||||
* |
||||
* <p>For optimal performance, re-use the TokenStream and Field instance |
||||
* for more than one document: |
||||
* |
||||
* <pre class="prettyprint"> |
||||
* NumericTokenStream stream = new NumericTokenStream(precisionStep); |
||||
* FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED); |
||||
* fieldType.setOmitNorms(true); |
||||
* fieldType.setIndexOptions(IndexOptions.DOCS_ONLY); |
||||
* Field field = new Field(name, stream, fieldType); |
||||
* Document document = new Document(); |
||||
* document.add(field); |
||||
* |
||||
* for(all documents) { |
||||
* stream.setIntValue(value); |
||||
* writer.addDocument(document); |
||||
* } |
||||
* </pre> |
||||
* |
||||
* <p>This stream is not intended to be used in analyzers; |
||||
* it's more for iterating the different precisions during |
||||
* indexing a specific numeric value.</p> |
||||
|
||||
* <p><b>NOTE</b>: as token streams are only consumed once |
||||
* the document is added to the index, if you index more |
||||
* than one numeric field, use a separate <code>NumericTokenStream</code> |
||||
* instance for each.</p> |
||||
* |
||||
* <p>See {@link NumericRangeQuery} for more details on the |
||||
* <a |
||||
* href="../search/NumericRangeQuery.html#precisionStepDesc"><code>precisionStep</code></a> |
||||
* parameter as well as how numeric fields work under the hood.</p> |
||||
* |
||||
* @since 2.9 |
||||
*/ |
||||
public final class NumericTokenStream extends TokenStream { |
||||
|
||||
/** The full precision token gets this token type assigned. */ |
||||
public static final String TOKEN_TYPE_FULL_PREC = "fullPrecNumeric"; |
||||
|
||||
/** The lower precision tokens get this token type assigned. */ |
||||
public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecNumeric"; |
||||
|
||||
/** <b>Expert:</b> Use this attribute to get the details of the currently generated token. |
||||
* @lucene.experimental |
||||
* @since 4.0 |
||||
*/ |
||||
public interface NumericTermAttribute extends Attribute { |
||||
/** Returns current shift value, undefined before first token */ |
||||
int getShift(); |
||||
/** Returns current token's raw value as {@code long} with all {@link #getShift} applied, undefined before first token */ |
||||
long getRawValue(); |
||||
/** Returns value size in bits (32 for {@code float}, {@code int}; 64 for {@code double}, {@code long}) */ |
||||
int getValueSize(); |
||||
|
||||
/** <em>Don't call this method!</em> |
||||
* @lucene.internal */ |
||||
void init(long value, int valSize, int precisionStep, int shift); |
||||
|
||||
/** <em>Don't call this method!</em> |
||||
* @lucene.internal */ |
||||
void setShift(int shift); |
||||
|
||||
/** <em>Don't call this method!</em> |
||||
* @lucene.internal */ |
||||
int incShift(); |
||||
} |
||||
|
||||
// just a wrapper to prevent adding a CharTermAttribute (CTA) |
||||
private static final class NumericAttributeFactory extends AttributeFactory { |
||||
private final AttributeFactory delegate; |
||||
|
||||
NumericAttributeFactory(AttributeFactory delegate) { |
||||
this.delegate = delegate; |
||||
} |
||||
|
||||
@Override |
||||
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) { |
||||
if (CharTermAttribute.class.isAssignableFrom(attClass)) |
||||
throw new IllegalArgumentException("NumericTokenStream does not support CharTermAttribute."); |
||||
return delegate.createAttributeInstance(attClass); |
||||
} |
||||
} |
||||
|
||||
/** Implementation of {@link NumericTermAttribute}. |
||||
* @lucene.internal |
||||
* @since 4.0 |
||||
*/ |
||||
public static final class NumericTermAttributeImpl extends AttributeImpl implements NumericTermAttribute,TermToBytesRefAttribute { |
||||
private long value = 0L; |
||||
private int valueSize = 0, shift = 0, precisionStep = 0; |
||||
private BytesRef bytes = new BytesRef(); |
||||
|
||||
/** |
||||
* Creates, but does not yet initialize this attribute instance |
||||
* @see #init(long, int, int, int) |
||||
*/ |
||||
public NumericTermAttributeImpl() {} |
||||
|
||||
public BytesRef getBytesRef() { |
||||
return bytes; |
||||
} |
||||
|
||||
public int fillBytesRef() { |
||||
try { |
||||
assert valueSize == 64 || valueSize == 32; |
||||
return (valueSize == 64) ? |
||||
NumericUtils.longToPrefixCoded(value, shift, bytes) : |
||||
NumericUtils.intToPrefixCoded((int) value, shift, bytes); |
||||
} catch (IllegalArgumentException iae) { |
||||
// return empty token before first or after last |
||||
bytes.length = 0; |
||||
return 0; |
||||
} |
||||
} |
||||
|
||||
public int getShift() { return shift; } |
||||
public void setShift(int shift) { this.shift = shift; } |
||||
public int incShift() { |
||||
return (shift += precisionStep); |
||||
} |
||||
|
||||
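// zero out the lowest 'shift' bits, leaving only the prefix this token encodes |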
public long getRawValue() { return value & ~((1L << shift) - 1L); } |
||||
public int getValueSize() { return valueSize; } |
||||
|
||||
public void init(long value, int valueSize, int precisionStep, int shift) { |
||||
this.value = value; |
||||
this.valueSize = valueSize; |
||||
this.precisionStep = precisionStep; |
||||
this.shift = shift; |
||||
} |
||||
|
||||
@Override |
||||
public void clear() { |
||||
// this attribute has no contents to clear! |
// we keep it untouched as it's fully controlled by outer class. |
||||
} |
||||
|
||||
@Override |
||||
public void reflectWith(AttributeReflector reflector) { |
||||
fillBytesRef(); |
||||
reflector.reflect(TermToBytesRefAttribute.class, "bytes", BytesRef.deepCopyOf(bytes)); |
||||
reflector.reflect(NumericTermAttribute.class, "shift", shift); |
||||
reflector.reflect(NumericTermAttribute.class, "rawValue", getRawValue()); |
||||
reflector.reflect(NumericTermAttribute.class, "valueSize", valueSize); |
||||
} |
||||
|
||||
@Override |
||||
public void copyTo(AttributeImpl target) { |
||||
final NumericTermAttribute a = (NumericTermAttribute) target; |
||||
a.init(value, valueSize, precisionStep, shift); |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Creates a token stream for numeric values using the default <code>precisionStep</code> |
||||
* {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The stream is not yet initialized; |
* before use, set a value using one of the set<em>???</em>Value() methods. |
||||
*/ |
||||
public NumericTokenStream() { |
||||
this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, NumericUtils.PRECISION_STEP_DEFAULT); |
||||
} |
||||
|
||||
/** |
||||
* Creates a token stream for numeric values with the specified |
||||
* <code>precisionStep</code>. The stream is not yet initialized; |
* before use, set a value using one of the set<em>???</em>Value() methods. |
||||
*/ |
||||
public NumericTokenStream(final int precisionStep) { |
||||
this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, precisionStep); |
||||
} |
||||
|
||||
/** |
||||
* Expert: Creates a token stream for numeric values with the specified |
||||
* <code>precisionStep</code> using the given |
||||
* {@link AttributeFactory}. |
||||
* The stream is not yet initialized; |
* before use, set a value using one of the set<em>???</em>Value() methods. |
||||
*/ |
||||
public NumericTokenStream(AttributeFactory factory, final int precisionStep) { |
||||
super(new NumericAttributeFactory(factory)); |
||||
if (precisionStep < 1) |
||||
throw new IllegalArgumentException("precisionStep must be >=1"); |
||||
this.precisionStep = precisionStep; |
||||
numericAtt.setShift(-precisionStep); |
||||
} |
||||
|
||||
/** |
||||
* Initializes the token stream with the supplied <code>long</code> value. |
||||
* @param value the value, for which this TokenStream should enumerate tokens. |
||||
* @return this instance, because of this you can use it the following way: |
||||
* <code>new Field(name, new NumericTokenStream(precisionStep).setLongValue(value))</code> |
||||
*/ |
||||
public NumericTokenStream setLongValue(final long value) { |
||||
numericAtt.init(value, valSize = 64, precisionStep, -precisionStep); |
||||
return this; |
||||
} |
||||
|
||||
/** |
||||
* Initializes the token stream with the supplied <code>int</code> value. |
||||
* @param value the value, for which this TokenStream should enumerate tokens. |
||||
* @return this instance, because of this you can use it the following way: |
||||
* <code>new Field(name, new NumericTokenStream(precisionStep).setIntValue(value))</code> |
||||
*/ |
||||
public NumericTokenStream setIntValue(final int value) { |
||||
numericAtt.init(value, valSize = 32, precisionStep, -precisionStep); |
||||
return this; |
||||
} |
||||
|
||||
/** |
||||
* Initializes the token stream with the supplied <code>double</code> value. |
||||
* @param value the value, for which this TokenStream should enumerate tokens. |
||||
* @return this instance, because of this you can use it the following way: |
||||
* <code>new Field(name, new NumericTokenStream(precisionStep).setDoubleValue(value))</code> |
||||
*/ |
||||
public NumericTokenStream setDoubleValue(final double value) { |
||||
numericAtt.init(NumericUtils.doubleToSortableLong(value), valSize = 64, precisionStep, -precisionStep); |
||||
return this; |
||||
} |
||||
|
||||
/** |
||||
* Initializes the token stream with the supplied <code>float</code> value. |
||||
* @param value the value, for which this TokenStream should enumerate tokens. |
||||
* @return this instance, because of this you can use it the following way: |
||||
* <code>new Field(name, new NumericTokenStream(precisionStep).setFloatValue(value))</code> |
||||
*/ |
||||
public NumericTokenStream setFloatValue(final float value) { |
||||
numericAtt.init(NumericUtils.floatToSortableInt(value), valSize = 32, precisionStep, -precisionStep); |
||||
return this; |
||||
} |
||||
|
||||
@Override |
||||
public void reset() { |
||||
if (valSize == 0) |
||||
throw new IllegalStateException("call set???Value() before usage"); |
||||
numericAtt.setShift(-precisionStep); |
||||
} |
||||
|
||||
@Override |
||||
public boolean incrementToken() { |
||||
if (valSize == 0) |
||||
throw new IllegalStateException("call set???Value() before usage"); |
||||
|
||||
// this will only clear all other attributes in this TokenStream |
||||
clearAttributes(); |
||||
|
||||
final int shift = numericAtt.incShift(); |
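// shift == 0 marks the full-precision token; every later token is a |
// lower-precision prefix stacked at the same position (posIncr 0) |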
||||
typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC); |
||||
posIncrAtt.setPositionIncrement((shift == 0) ? 1 : 0); |
||||
return (shift < valSize); |
||||
} |
||||
|
||||
/** Returns the precision step. */ |
||||
public int getPrecisionStep() { |
||||
return precisionStep; |
||||
} |
||||
|
||||
// members |
||||
private final NumericTermAttribute numericAtt = addAttribute(NumericTermAttribute.class); |
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); |
||||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); |
||||
|
||||
private int valSize = 0; // valSize==0 means not initialized |
||||
private final int precisionStep; |
||||
} |
@ -0,0 +1,651 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.FlagsAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
||||
import com.fr.third.org.apache.lucene.index.DocsAndPositionsEnum; // for javadoc |
||||
import com.fr.third.org.apache.lucene.util.Attribute; |
||||
import com.fr.third.org.apache.lucene.util.AttributeSource; |
||||
import com.fr.third.org.apache.lucene.util.AttributeImpl; |
||||
import com.fr.third.org.apache.lucene.util.AttributeReflector; |
||||
import com.fr.third.org.apache.lucene.util.BytesRef; |
||||
|
||||
/** |
||||
A Token is an occurrence of a term from the text of a field. It consists of |
||||
a term's text, the start and end offset of the term in the text of the field, |
||||
and a type string. |
||||
<p> |
||||
The start and end offsets permit applications to re-associate a token with |
||||
its source text, e.g., to display highlighted query terms in a document |
||||
browser, or to show matching text fragments in a <abbr title="KeyWord In Context">KWIC</abbr> |
||||
display, etc. |
||||
<p> |
||||
The type is a string, assigned by a lexical analyzer |
||||
(a.k.a. tokenizer), naming the lexical or syntactic class that the token |
||||
belongs to. For example an end of sentence marker token might be implemented |
||||
with type "eos". The default token type is "word". |
||||
<p> |
||||
A Token can optionally have metadata (a.k.a. payload) in the form of a variable |
||||
length byte array. Use {@link DocsAndPositionsEnum#getPayload()} to retrieve the |
||||
payloads from the index. |
||||
|
||||
<br><br> |
||||
|
||||
<p><b>NOTE:</b> As of 2.9, Token implements all {@link Attribute} interfaces |
||||
that are part of core Lucene and can be found in the {@code tokenattributes} subpackage. |
||||
Even though it is not necessary to use Token anymore, with the new TokenStream API it can |
||||
be used as a convenience class that implements all {@link Attribute}s, which is especially useful |
||||
to easily switch from the old to the new TokenStream API. |
||||
|
||||
<br><br> |
||||
|
||||
<p>Tokenizers and TokenFilters should try to re-use a Token |
||||
instance when possible for best performance, by |
||||
implementing the {@link TokenStream#incrementToken()} API. |
||||
Failing that, to create a new Token you should first use |
||||
one of the constructors that start with null text. To load |
||||
the token from a char[] use {@link #copyBuffer(char[], int, int)}. |
||||
To load from a String use {@link #setEmpty} followed by {@link #append(CharSequence)} or {@link #append(CharSequence, int, int)}. |
||||
Alternatively you can get the Token's termBuffer by calling either {@link #buffer()}, |
||||
if you know that your text is shorter than the capacity of the termBuffer |
||||
or {@link #resizeBuffer(int)}, if there is any possibility |
||||
that you may need to grow the buffer. Fill in the characters of your term into this |
||||
buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string, |
||||
or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setLength(int)} to |
||||
set the length of the term text. See <a target="_top" |
||||
href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a> |
||||
for details.</p> |
||||
<p>Typical Token reuse patterns: |
||||
<ul> |
||||
<li> Copying text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/> |
||||
<pre class="prettyprint"> |
||||
return reusableToken.reinit(string, startOffset, endOffset[, type]); |
||||
</pre> |
||||
</li> |
||||
<li> Copying some text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/> |
||||
<pre class="prettyprint"> |
||||
return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]); |
||||
</pre> |
||||
</li> |
||||
<li> Copying text from char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/> |
||||
<pre class="prettyprint"> |
||||
return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]); |
||||
</pre> |
||||
</li> |
||||
<li> Copying some text from a char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/> |
||||
<pre class="prettyprint"> |
||||
return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]); |
||||
</pre> |
||||
</li> |
||||
<li> Copying from one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):<br/> |
||||
<pre class="prettyprint"> |
||||
return reusableToken.reinit(source.buffer(), 0, source.length(), source.startOffset(), source.endOffset()[, source.type()]); |
||||
</pre> |
||||
</li> |
||||
</ul> |
||||
A few things to note: |
||||
<ul> |
||||
<li>clear() initializes all of the fields to default values. This is a change from Lucene 2.4, but should affect no one.</li> |
||||
<li>Because <code>TokenStreams</code> can be chained, one cannot assume that the <code>Token's</code> current type is correct.</li> |
||||
<li>The startOffset and endOffset represent the start and end offset in the source text, so be careful in adjusting them.</li> |
||||
<li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.</li> |
||||
</ul> |
||||
</p> |
||||
<p> |
||||
<b>Please note:</b> With Lucene 3.1, the <code>{@linkplain #toString toString()}</code> method had to be changed to match the |
||||
{@link CharSequence} interface introduced by the interface {@link com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute}. |
||||
This method now prints only the term text, with no additional information. |
||||
</p> |
||||
*/ |
||||
public class Token extends CharTermAttributeImpl |
||||
implements TypeAttribute, PositionIncrementAttribute, |
||||
FlagsAttribute, OffsetAttribute, PayloadAttribute, PositionLengthAttribute { |
||||
|
||||
private int startOffset,endOffset; |
||||
private String type = DEFAULT_TYPE; |
||||
private int flags; |
||||
private BytesRef payload; |
||||
private int positionIncrement = 1; |
||||
private int positionLength = 1; |
||||
|
||||
/** Constructs a Token with null text. */ |
||||
public Token() { |
||||
} |
||||
|
||||
/** Constructs a Token with null text and start & end |
||||
* offsets. |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text */ |
||||
public Token(int start, int end) { |
||||
checkOffsets(start, end); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
} |
||||
|
||||
/** Constructs a Token with null text and start & end |
||||
* offsets plus the Token type. |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text |
||||
* @param typ the lexical type of this Token */ |
||||
public Token(int start, int end, String typ) { |
||||
checkOffsets(start, end); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
type = typ; |
||||
} |
||||
|
||||
/** |
||||
* Constructs a Token with null text and start & end |
||||
* offsets plus flags. NOTE: flags is EXPERIMENTAL. |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text |
||||
* @param flags The bits to set for this token |
||||
*/ |
||||
public Token(int start, int end, int flags) { |
||||
checkOffsets(start, end); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
this.flags = flags; |
||||
} |
||||
|
||||
/** Constructs a Token with the given term text, and start |
||||
* & end offsets. The type defaults to "word." |
||||
* <b>NOTE:</b> for better indexing speed you should |
||||
* instead use the char[] termBuffer methods to set the |
||||
* term text. |
||||
* @param text term text |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text |
||||
*/ |
||||
public Token(String text, int start, int end) { |
||||
checkOffsets(start, end); |
||||
append(text); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
} |
||||
|
||||
/** Constructs a Token with the given text, start and end |
||||
* offsets, & type. <b>NOTE:</b> for better indexing |
||||
* speed you should instead use the char[] termBuffer |
||||
* methods to set the term text. |
||||
* @param text term text |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text |
||||
* @param typ token type |
||||
*/ |
||||
public Token(String text, int start, int end, String typ) { |
||||
checkOffsets(start, end); |
||||
append(text); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
type = typ; |
||||
} |
||||
|
||||
/** |
||||
* Constructs a Token with the given text, start and end |
||||
* offsets, & type. <b>NOTE:</b> for better indexing |
||||
* speed you should instead use the char[] termBuffer |
||||
* methods to set the term text. |
||||
* @param text term text |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text |
||||
* @param flags token type bits |
||||
*/ |
||||
public Token(String text, int start, int end, int flags) { |
||||
checkOffsets(start, end); |
||||
append(text); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
this.flags = flags; |
||||
} |
||||
|
||||
/** |
||||
* Constructs a Token with the given term buffer (offset |
||||
* & length), start and end |
||||
* offsets |
||||
* @param startTermBuffer buffer containing term text |
||||
* @param termBufferOffset the index in the buffer of the first character |
||||
* @param termBufferLength number of valid characters in the buffer |
||||
* @param start start offset in the source text |
||||
* @param end end offset in the source text |
||||
*/ |
||||
public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) { |
||||
checkOffsets(start, end); |
||||
copyBuffer(startTermBuffer, termBufferOffset, termBufferLength); |
||||
startOffset = start; |
||||
endOffset = end; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see PositionIncrementAttribute |
||||
*/ |
||||
public void setPositionIncrement(int positionIncrement) { |
||||
if (positionIncrement < 0) |
||||
throw new IllegalArgumentException |
||||
("Increment must be zero or greater: " + positionIncrement); |
||||
this.positionIncrement = positionIncrement; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see PositionIncrementAttribute |
||||
*/ |
||||
public int getPositionIncrement() { |
||||
return positionIncrement; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see PositionLengthAttribute |
||||
*/ |
||||
@Override |
||||
public void setPositionLength(int positionLength) { |
||||
this.positionLength = positionLength; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see PositionLengthAttribute |
||||
*/ |
||||
@Override |
||||
public int getPositionLength() { |
||||
return positionLength; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see OffsetAttribute |
||||
*/ |
||||
public final int startOffset() { |
||||
return startOffset; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see OffsetAttribute |
||||
*/ |
||||
public final int endOffset() { |
||||
return endOffset; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see OffsetAttribute |
||||
*/ |
||||
public void setOffset(int startOffset, int endOffset) { |
||||
checkOffsets(startOffset, endOffset); |
||||
this.startOffset = startOffset; |
||||
this.endOffset = endOffset; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see TypeAttribute |
||||
*/ |
||||
public final String type() { |
||||
return type; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see TypeAttribute |
||||
*/ |
||||
public final void setType(String type) { |
||||
this.type = type; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see FlagsAttribute |
||||
*/ |
||||
public int getFlags() { |
||||
return flags; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see FlagsAttribute |
||||
*/ |
||||
public void setFlags(int flags) { |
||||
this.flags = flags; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see PayloadAttribute |
||||
*/ |
||||
public BytesRef getPayload() { |
||||
return this.payload; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* @see PayloadAttribute |
||||
*/ |
||||
public void setPayload(BytesRef payload) { |
||||
this.payload = payload; |
||||
} |
||||
|
||||
/** Resets the term text, payload, flags, and positionIncrement, |
||||
* startOffset, endOffset and token type to default. |
||||
*/ |
||||
@Override |
||||
public void clear() { |
||||
super.clear(); |
||||
payload = null; |
||||
positionIncrement = 1; |
||||
flags = 0; |
||||
startOffset = endOffset = 0; |
||||
type = DEFAULT_TYPE; |
||||
} |
||||
|
||||
@Override |
||||
public Token clone() { |
||||
Token t = (Token)super.clone(); |
||||
// Do a deep clone |
||||
if (payload != null) { |
||||
t.payload = payload.clone(); |
||||
} |
||||
return t; |
||||
} |
||||
|
||||
/** Makes a clone, but replaces the term buffer & |
||||
* start/end offset in the process. This is more |
||||
* efficient than doing a full clone (and then calling |
||||
* {@link #copyBuffer}) because it saves a wasted copy of the old |
||||
* termBuffer. */ |
||||
public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { |
||||
final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset); |
||||
t.positionIncrement = positionIncrement; |
||||
t.flags = flags; |
||||
t.type = type; |
||||
if (payload != null) |
||||
t.payload = payload.clone(); |
||||
return t; |
||||
} |
||||
|
||||
@Override |
||||
public boolean equals(Object obj) { |
||||
if (obj == this) |
||||
return true; |
||||
|
||||
if (obj instanceof Token) { |
||||
final Token other = (Token) obj; |
||||
return (startOffset == other.startOffset && |
||||
endOffset == other.endOffset && |
||||
flags == other.flags && |
||||
positionIncrement == other.positionIncrement && |
||||
(type == null ? other.type == null : type.equals(other.type)) && |
||||
(payload == null ? other.payload == null : payload.equals(other.payload)) && |
||||
super.equals(obj) |
||||
); |
||||
} else |
||||
return false; |
||||
} |
||||
|
||||
@Override |
||||
public int hashCode() { |
||||
int code = super.hashCode(); |
||||
code = code * 31 + startOffset; |
||||
code = code * 31 + endOffset; |
||||
code = code * 31 + flags; |
||||
code = code * 31 + positionIncrement; |
||||
if (type != null) |
||||
code = code * 31 + type.hashCode(); |
||||
if (payload != null) |
||||
code = code * 31 + payload.hashCode(); |
||||
return code; |
||||
} |
||||
|
||||
// like clear() but doesn't clear termBuffer/text |
||||
private void clearNoTermBuffer() { |
||||
payload = null; |
||||
positionIncrement = 1; |
||||
flags = 0; |
||||
startOffset = endOffset = 0; |
||||
type = DEFAULT_TYPE; |
||||
} |
||||
|
||||
/** Shorthand for calling {@link #clear}, |
||||
* {@link #copyBuffer(char[], int, int)}, |
||||
* {@link #setOffset}, |
||||
* {@link #setType} |
||||
* @return this Token instance */ |
||||
public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { |
||||
checkOffsets(newStartOffset, newEndOffset); |
||||
clearNoTermBuffer(); |
||||
copyBuffer(newTermBuffer, newTermOffset, newTermLength); |
||||
payload = null; |
||||
positionIncrement = 1; |
||||
startOffset = newStartOffset; |
||||
endOffset = newEndOffset; |
||||
type = newType; |
||||
return this; |
||||
} |
||||
|
||||
/** Shorthand for calling {@link #clear}, |
||||
* {@link #copyBuffer(char[], int, int)}, |
||||
* {@link #setOffset}, |
||||
* {@link #setType} on Token.DEFAULT_TYPE |
||||
* @return this Token instance */ |
||||
public Token reinit(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { |
||||
checkOffsets(newStartOffset, newEndOffset); |
||||
clearNoTermBuffer(); |
||||
copyBuffer(newTermBuffer, newTermOffset, newTermLength); |
||||
startOffset = newStartOffset; |
||||
endOffset = newEndOffset; |
||||
type = DEFAULT_TYPE; |
||||
return this; |
||||
} |
||||
|
||||
/** Shorthand for calling {@link #clear}, |
||||
* {@link #append(CharSequence)}, |
||||
* {@link #setOffset}, |
||||
* {@link #setType} |
||||
* @return this Token instance */ |
||||
public Token reinit(String newTerm, int newStartOffset, int newEndOffset, String newType) { |
||||
checkOffsets(newStartOffset, newEndOffset); |
||||
clear(); |
||||
append(newTerm); |
||||
startOffset = newStartOffset; |
||||
endOffset = newEndOffset; |
||||
type = newType; |
||||
return this; |
||||
} |
||||
|
||||
/** Shorthand for calling {@link #clear}, |
||||
* {@link #append(CharSequence, int, int)}, |
||||
* {@link #setOffset}, |
||||
* {@link #setType} |
||||
* @return this Token instance */ |
||||
public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset, String newType) { |
||||
checkOffsets(newStartOffset, newEndOffset); |
||||
clear(); |
||||
append(newTerm, newTermOffset, newTermOffset + newTermLength); |
||||
startOffset = newStartOffset; |
||||
endOffset = newEndOffset; |
||||
type = newType; |
||||
return this; |
||||
} |
||||
|
||||
/** Shorthand for calling {@link #clear}, |
||||
* {@link #append(CharSequence)}, |
||||
* {@link #setOffset}, |
||||
* {@link #setType} on Token.DEFAULT_TYPE |
||||
* @return this Token instance */ |
||||
public Token reinit(String newTerm, int newStartOffset, int newEndOffset) { |
||||
checkOffsets(newStartOffset, newEndOffset); |
||||
clear(); |
||||
append(newTerm); |
||||
startOffset = newStartOffset; |
||||
endOffset = newEndOffset; |
||||
type = DEFAULT_TYPE; |
||||
return this; |
||||
} |
||||
|
||||
/** Shorthand for calling {@link #clear}, |
||||
* {@link #append(CharSequence, int, int)}, |
||||
* {@link #setOffset}, |
||||
* {@link #setType} on Token.DEFAULT_TYPE |
||||
* @return this Token instance */ |
||||
public Token reinit(String newTerm, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { |
||||
checkOffsets(newStartOffset, newEndOffset); |
||||
clear(); |
||||
append(newTerm, newTermOffset, newTermOffset + newTermLength); |
||||
startOffset = newStartOffset; |
||||
endOffset = newEndOffset; |
||||
type = DEFAULT_TYPE; |
||||
return this; |
||||
} |
||||
|
||||
/** |
||||
* Copy the prototype token's fields into this one. Note: Payloads are shared. |
||||
* @param prototype source Token to copy fields from |
||||
*/ |
||||
public void reinit(Token prototype) { |
||||
copyBuffer(prototype.buffer(), 0, prototype.length()); |
||||
positionIncrement = prototype.positionIncrement; |
||||
flags = prototype.flags; |
||||
startOffset = prototype.startOffset; |
||||
endOffset = prototype.endOffset; |
||||
type = prototype.type; |
||||
payload = prototype.payload; |
||||
} |
||||
|
||||
/** |
||||
* Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared. |
||||
* @param prototype existing Token |
||||
* @param newTerm new term text |
||||
*/ |
||||
public void reinit(Token prototype, String newTerm) { |
||||
setEmpty().append(newTerm); |
||||
positionIncrement = prototype.positionIncrement; |
||||
flags = prototype.flags; |
||||
startOffset = prototype.startOffset; |
||||
endOffset = prototype.endOffset; |
||||
type = prototype.type; |
||||
payload = prototype.payload; |
||||
} |
||||
|
||||
/** |
||||
* Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared. |
||||
* @param prototype existing Token |
||||
* @param newTermBuffer buffer containing new term text |
||||
* @param offset the index in the buffer of the first character |
||||
* @param length number of valid characters in the buffer |
||||
*/ |
||||
public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) { |
||||
copyBuffer(newTermBuffer, offset, length); |
||||
positionIncrement = prototype.positionIncrement; |
||||
flags = prototype.flags; |
||||
startOffset = prototype.startOffset; |
||||
endOffset = prototype.endOffset; |
||||
type = prototype.type; |
||||
payload = prototype.payload; |
||||
} |
||||
|
||||
@Override |
||||
public void copyTo(AttributeImpl target) { |
||||
if (target instanceof Token) { |
||||
final Token to = (Token) target; |
||||
to.reinit(this); |
||||
// reinit shares the payload, so clone it: |
||||
if (payload !=null) { |
||||
to.payload = payload.clone(); |
||||
} |
||||
} else { |
||||
super.copyTo(target); |
||||
((OffsetAttribute) target).setOffset(startOffset, endOffset); |
||||
((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement); |
||||
((PayloadAttribute) target).setPayload((payload == null) ? null : payload.clone()); |
||||
((FlagsAttribute) target).setFlags(flags); |
||||
((TypeAttribute) target).setType(type); |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public void reflectWith(AttributeReflector reflector) { |
||||
super.reflectWith(reflector); |
||||
reflector.reflect(OffsetAttribute.class, "startOffset", startOffset); |
||||
reflector.reflect(OffsetAttribute.class, "endOffset", endOffset); |
||||
reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement); |
||||
reflector.reflect(PayloadAttribute.class, "payload", payload); |
||||
reflector.reflect(FlagsAttribute.class, "flags", flags); |
||||
reflector.reflect(TypeAttribute.class, "type", type); |
||||
} |
||||
|
||||
private void checkOffsets(int startOffset, int endOffset) { |
||||
if (startOffset < 0 || endOffset < startOffset) { |
||||
throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " |
||||
+ "startOffset=" + startOffset + ",endOffset=" + endOffset); |
||||
} |
||||
} |
||||
|
||||
/** Convenience factory that returns <code>Token</code> as implementation for the basic |
||||
* attributes and returns the default impl (with "Impl" appended) for all other |
||||
* attributes. |
||||
* @since 3.0 |
||||
*/ |
||||
public static final AttributeSource.AttributeFactory TOKEN_ATTRIBUTE_FACTORY = |
||||
new TokenAttributeFactory(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); |
||||
|
||||
/** <b>Expert:</b> Creates a TokenAttributeFactory returning {@link Token} as instance for the basic attributes |
||||
* and for all other attributes calls the given delegate factory. |
||||
* @since 3.0 |
||||
*/ |
||||
public static final class TokenAttributeFactory extends AttributeSource.AttributeFactory { |
||||
|
||||
private final AttributeSource.AttributeFactory delegate; |
||||
|
||||
/** <b>Expert</b>: Creates an AttributeFactory returning {@link Token} as instance for the basic attributes |
||||
* and for all other attributes calls the given delegate factory. */ |
||||
public TokenAttributeFactory(AttributeSource.AttributeFactory delegate) { |
||||
this.delegate = delegate; |
||||
} |
||||
|
||||
@Override |
||||
public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) { |
||||
return attClass.isAssignableFrom(Token.class) |
||||
? new Token() : delegate.createAttributeInstance(attClass); |
||||
} |
||||
|
||||
@Override |
||||
public boolean equals(Object other) { |
||||
if (this == other) return true; |
||||
if (other instanceof TokenAttributeFactory) { |
||||
final TokenAttributeFactory af = (TokenAttributeFactory) other; |
||||
return this.delegate.equals(af.delegate); |
||||
} |
||||
return false; |
||||
} |
||||
|
||||
@Override |
||||
public int hashCode() { |
||||
return delegate.hashCode() ^ 0x0a45aa31; |
||||
} |
||||
} |
||||
|
||||
} |
@ -0,0 +1,72 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
|
||||
/** A TokenFilter is a TokenStream whose input is another TokenStream. |
||||
<p> |
||||
This is an abstract class; subclasses must override {@link #incrementToken()}. |
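<p> |
A minimal sketch of a filter that upper-cases each term it passes through |
(illustrative only; the attribute class comes from the tokenattributes package): |
<pre class="prettyprint"> |
final class UpperCaseFilter extends TokenFilter { |
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
  UpperCaseFilter(TokenStream in) { super(in); } |
  public boolean incrementToken() throws IOException { |
    if (!input.incrementToken()) return false; |
    final char[] buffer = termAtt.buffer(); |
    for (int i = termAtt.length() - 1; i >= 0; i--) { |
      buffer[i] = Character.toUpperCase(buffer[i]); |
    } |
    return true; |
  } |
} |
</pre> |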
||||
@see TokenStream |
||||
*/ |
||||
public abstract class TokenFilter extends TokenStream { |
||||
/** The source of tokens for this filter. */ |
||||
protected final TokenStream input; |
||||
|
||||
/** Construct a token stream filtering the given input. */ |
||||
protected TokenFilter(TokenStream input) { |
||||
super(input); |
||||
this.input = input; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* <p> |
||||
* <b>NOTE:</b> |
||||
* The default implementation chains the call to the input TokenStream, so |
||||
* be sure to call <code>super.end()</code> first when overriding this method. |
||||
*/ |
||||
@Override |
||||
public void end() throws IOException { |
||||
input.end(); |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* <p> |
||||
* <b>NOTE:</b> |
||||
* The default implementation chains the call to the input TokenStream, so |
||||
* be sure to call <code>super.close()</code> when overriding this method. |
||||
*/ |
||||
@Override |
||||
public void close() throws IOException { |
||||
input.close(); |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* <p> |
||||
* <b>NOTE:</b> |
||||
* The default implementation chains the call to the input TokenStream, so |
||||
* be sure to call <code>super.reset()</code> when overriding this method. |
||||
*/ |
||||
@Override |
||||
public void reset() throws IOException { |
||||
input.reset(); |
||||
} |
||||
} |
@ -0,0 +1,181 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Closeable; |
||||
import java.lang.reflect.Modifier; |
||||
|
||||
import com.fr.third.org.apache.lucene.document.Document; |
||||
import com.fr.third.org.apache.lucene.document.Field; |
||||
import com.fr.third.org.apache.lucene.index.IndexWriter; |
||||
import com.fr.third.org.apache.lucene.util.Attribute; |
||||
import com.fr.third.org.apache.lucene.util.AttributeImpl; |
||||
import com.fr.third.org.apache.lucene.util.AttributeSource; |
||||
|
||||
/** |
||||
* A <code>TokenStream</code> enumerates the sequence of tokens, either from |
||||
* {@link Field}s of a {@link Document} or from query text. |
||||
* <p> |
||||
* This is an abstract class; concrete subclasses are: |
||||
* <ul> |
||||
* <li>{@link Tokenizer}, a <code>TokenStream</code> whose input is a Reader; and |
||||
* <li>{@link TokenFilter}, a <code>TokenStream</code> whose input is another |
||||
* <code>TokenStream</code>. |
||||
* </ul> |
||||
* A new <code>TokenStream</code> API has been introduced with Lucene 2.9. This API |
||||
* has moved from being {@link Token}-based to {@link Attribute}-based. While |
||||
* {@link Token} still exists in 2.9 as a convenience class, the preferred way |
||||
* to store the information of a {@link Token} is to use {@link AttributeImpl}s. |
||||
* <p> |
||||
* <code>TokenStream</code> now extends {@link AttributeSource}, which provides |
||||
* access to all of the token {@link Attribute}s for the <code>TokenStream</code>. |
||||
* Note that only one instance per {@link AttributeImpl} is created and reused |
||||
* for every token. This approach reduces object creation and allows local |
||||
* caching of references to the {@link AttributeImpl}s. See |
||||
* {@link #incrementToken()} for further details. |
||||
* <p> |
||||
* <b>The workflow of the new <code>TokenStream</code> API is as follows</b> (a code sketch follows the list): |
||||
* <ol> |
||||
* <li>Instantiation of <code>TokenStream</code>/{@link TokenFilter}s which add/get |
||||
* attributes to/from the {@link AttributeSource}. |
||||
* <li>The consumer calls {@link TokenStream#reset()}. |
||||
* <li>The consumer retrieves attributes from the stream and stores local |
||||
* references to all attributes it wants to access. |
||||
* <li>The consumer calls {@link #incrementToken()} until it returns false, |
||||
* consuming the attributes after each call. |
||||
* <li>The consumer calls {@link #end()} so that any end-of-stream operations |
||||
* can be performed. |
||||
* <li>The consumer calls {@link #close()} to release any resource when finished |
||||
* using the <code>TokenStream</code>. |
||||
* </ol> |
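* <p>A consumer loop sketch following this workflow (the analyzer, field |
* name and reader are illustrative): |
* <pre class="prettyprint"> |
* TokenStream stream = analyzer.tokenStream("body", reader); |
* CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); |
* stream.reset(); |
* while (stream.incrementToken()) { |
*   System.out.println(termAtt.toString()); |
* } |
* stream.end(); |
* stream.close(); |
* </pre> |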
||||
* To make sure that filters and consumers know which attributes are available, |
||||
* the attributes must be added during instantiation. Filters and consumers are |
||||
* not required to check for availability of attributes in |
||||
* {@link #incrementToken()}. |
||||
* <p> |
||||
* You can find some example code for the new API in the analysis package level |
||||
* Javadoc. |
||||
* <p> |
||||
* Sometimes it is desirable to capture a current state of a <code>TokenStream</code>, |
||||
* e.g., for buffering purposes (see {@link CachingTokenFilter}, |
||||
* TeeSinkTokenFilter). For this use case |
||||
* {@link AttributeSource#captureState} and {@link AttributeSource#restoreState} |
||||
* can be used. |
||||
* <p>The {@code TokenStream}-API in Lucene is based on the decorator pattern. |
||||
* Therefore all non-abstract subclasses must be final or have at least a final |
||||
* implementation of {@link #incrementToken}! This is checked when Java |
||||
* assertions are enabled. |
||||
*/ |
||||
public abstract class TokenStream extends AttributeSource implements Closeable { |
||||
|
||||
/** |
||||
* A TokenStream using the default attribute factory. |
||||
*/ |
||||
protected TokenStream() { |
||||
super(); |
||||
assert assertFinal(); |
||||
} |
||||
|
||||
/** |
||||
* A TokenStream that uses the same attributes as the supplied one. |
||||
*/ |
||||
protected TokenStream(AttributeSource input) { |
||||
super(input); |
||||
assert assertFinal(); |
||||
} |
||||
|
||||
/** |
||||
* A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances. |
||||
*/ |
||||
protected TokenStream(AttributeFactory factory) { |
||||
super(factory); |
||||
assert assertFinal(); |
||||
} |
||||
|
||||
private boolean assertFinal() { |
||||
try { |
||||
final Class<?> clazz = getClass(); |
||||
if (!clazz.desiredAssertionStatus()) |
||||
return true; |
||||
assert clazz.isAnonymousClass() || |
||||
(clazz.getModifiers() & (Modifier.FINAL | Modifier.PRIVATE)) != 0 || |
||||
Modifier.isFinal(clazz.getMethod("incrementToken").getModifiers()) : |
||||
"TokenStream implementation classes or at least their incrementToken() implementation must be final"; |
||||
return true; |
||||
} catch (NoSuchMethodException nsme) { |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Consumers (i.e., {@link IndexWriter}) use this method to advance the stream to |
||||
* the next token. Implementing classes must implement this method and update |
||||
* the appropriate {@link AttributeImpl}s with the attributes of the next |
||||
* token. |
||||
* <P> |
||||
* The producer must make no assumptions about the attributes after the method |
||||
* has been returned: the caller may arbitrarily change it. If the producer |
||||
* needs to preserve the state for subsequent calls, it can use |
||||
* {@link #captureState} to create a copy of the current attribute state. |
||||
* <p> |
||||
* This method is called for every token of a document, so an efficient |
||||
* implementation is crucial for good performance. To avoid calls to |
||||
* {@link #addAttribute(Class)} and {@link #getAttribute(Class)}, |
||||
* references to all {@link AttributeImpl}s that this stream uses should be |
||||
* retrieved during instantiation. |
||||
* <p> |
||||
* To ensure that filters and consumers know which attributes are available, |
||||
* the attributes must be added during instantiation. Filters and consumers |
||||
* are not required to check for availability of attributes in |
||||
* {@link #incrementToken()}. |
||||
* |
||||
* @return false for end of stream; true otherwise |
||||
*/ |
||||
public abstract boolean incrementToken() throws IOException; |
||||
|
||||
/** |
||||
* This method is called by the consumer after the last token has been |
||||
* consumed, after {@link #incrementToken()} returned <code>false</code> |
||||
* (using the new <code>TokenStream</code> API). Streams implementing the old API |
||||
* should upgrade to use this feature. |
||||
* <p/> |
||||
* This method can be used to perform any end-of-stream operations, such as |
||||
* setting the final offset of a stream. The final offset of a stream might |
||||
* differ from the offset of the last token, e.g. in case one or more whitespace |
* characters followed the last token and a WhitespaceTokenizer was used. |
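* <p>An override sketch (<code>offsetAtt</code> and <code>finalOffset</code> |
* are illustrative fields assumed to exist in the subclass): |
* <pre class="prettyprint"> |
* public void end() throws IOException { |
*   super.end(); |
*   offsetAtt.setOffset(finalOffset, finalOffset); |
* } |
* </pre> |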
||||
* |
||||
* @throws IOException If an I/O error occurs |
||||
*/ |
||||
public void end() throws IOException { |
||||
// do nothing by default |
||||
} |
||||
|
||||
/** |
||||
* This method is called by a consumer before it begins consumption using |
||||
* {@link #incrementToken()}. |
||||
* <p/> |
||||
* Resets this stream to a clean state. Stateful implementations must implement |
||||
* this method so that they can be reused, just as if they had been created fresh. |
||||
*/ |
||||
public void reset() throws IOException {} |
||||
|
||||
/** Releases resources associated with this stream. */ |
||||
public void close() throws IOException {} |
||||
|
||||
} |
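The consumer contract documented above (reset, then incrementToken until it returns false, then end and close) is easiest to see as a loop. A minimal sketch, assuming an existing Analyzer instance named analyzer and the usual tokenattributes imports; the field name and sample text are illustrative, not part of this file:

TokenStream stream = analyzer.tokenStream("field", new StringReader("some text"));
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); // retrieve once, up front
stream.reset();                    // bring the stream to a clean state before consuming
while (stream.incrementToken()) {  // false signals end of stream
  System.out.println(termAtt.toString());
}
stream.end();                      // end-of-stream operations, e.g. the final offset
stream.close();                    // release underlying resources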
@ -0,0 +1,99 @@
|
||||
package com.fr.third.org.apache.lucene.analysis; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.util.AttributeSource; |
||||
|
||||
import java.io.Reader; |
||||
import java.io.IOException; |
||||
|
||||
/** A Tokenizer is a TokenStream whose input is a Reader. |
||||
<p> |
||||
This is an abstract class; subclasses must override {@link #incrementToken()} |
||||
<p> |
||||
NOTE: Subclasses overriding {@link #incrementToken()} must |
||||
call {@link AttributeSource#clearAttributes()} before |
||||
setting attributes. |
||||
*/ |
||||
public abstract class Tokenizer extends TokenStream { |
||||
/** The text source for this Tokenizer. */ |
||||
protected Reader input; |
||||
|
||||
/** Construct a token stream processing the given input. */ |
||||
protected Tokenizer(Reader input) { |
||||
assert input != null: "input must not be null"; |
||||
this.input = input; |
||||
} |
||||
|
||||
/** Construct a token stream processing the given input using the given AttributeFactory. */ |
||||
protected Tokenizer(AttributeFactory factory, Reader input) { |
||||
super(factory); |
||||
assert input != null: "input must not be null"; |
||||
this.input = input; |
||||
} |
||||
|
||||
/** Construct a token stream processing the given input using the given AttributeSource. */ |
||||
protected Tokenizer(AttributeSource source, Reader input) { |
||||
super(source); |
||||
assert input != null: "input must not be null"; |
||||
this.input = input; |
||||
} |
||||
|
||||
/** |
||||
* {@inheritDoc} |
||||
* <p> |
||||
* <b>NOTE:</b> |
||||
* The default implementation closes the input Reader, so |
||||
* be sure to call <code>super.close()</code> when overriding this method. |
||||
*/ |
||||
@Override |
||||
public void close() throws IOException { |
||||
if (input != null) { |
||||
input.close(); |
||||
// LUCENE-2387: don't hold onto Reader after close, so
|
||||
// GC can reclaim
|
||||
input = null; |
||||
} |
||||
} |
||||
|
||||
/** Return the corrected offset. If {@link #input} is a {@link CharFilter} subclass |
||||
* this method calls {@link CharFilter#correctOffset}, else returns <code>currentOff</code>. |
||||
* @param currentOff offset as seen in the output |
||||
* @return corrected offset based on the input |
||||
* @see CharFilter#correctOffset |
||||
*/ |
||||
protected final int correctOffset(int currentOff) { |
||||
assert input != null: "this tokenizer is closed"; |
||||
return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(currentOff) : currentOff; |
||||
} |
||||
|
||||
/** Expert: Set a new reader on the Tokenizer. Typically, an |
||||
* analyzer (in its tokenStream method) will use |
||||
* this to re-use a previously created tokenizer. */ |
||||
public final void setReader(Reader input) throws IOException { |
||||
assert input != null: "input must not be null"; |
||||
this.input = input; |
||||
assert setReaderTestPoint(); |
||||
} |
||||
|
||||
// only used by assert, for testing
|
||||
boolean setReaderTestPoint() { |
||||
return true; |
||||
} |
||||
} |
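A minimal sketch of a Tokenizer subclass, illustrating the two contracts noted above: clearAttributes() is called before any attribute is set, and offsets are passed through correctOffset(). It emits (up to) the first 256 characters of the input as a single token; the class and field names are invented for illustration:

final class WholeInputTokenizer extends Tokenizer {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private boolean done = false;

  WholeInputTokenizer(Reader in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (done) {
      return false;
    }
    done = true;
    clearAttributes();                 // required before setting attributes
    char[] buffer = new char[256];
    int length = input.read(buffer);
    if (length <= 0) {
      return false;                    // empty input: no token
    }
    termAtt.copyBuffer(buffer, 0, length);
    offsetAtt.setOffset(correctOffset(0), correctOffset(length));
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    done = false;                      // make the tokenizer reusable
  }
}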
||||
|
@ -0,0 +1,153 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.Analyzer; |
||||
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.core.StopFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet; |
||||
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.Tokenizer; |
||||
import com.fr.third.org.apache.lucene.util.Version; |
||||
|
||||
/** |
||||
* {@link Analyzer} for Arabic. |
||||
* <p> |
||||
* This analyzer implements light-stemming as specified by: |
||||
* <i> |
||||
* Light Stemming for Arabic Information Retrieval |
||||
* </i> |
||||
* http://www.mtholyoke.edu/~lballest/Pubs/arab_stem05.pdf
|
||||
* <p> |
||||
* The analysis package contains three primary components: |
||||
* <ul> |
||||
* <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization. |
||||
* <li>{@link ArabicStemFilter}: Arabic light stemming |
||||
* <li>Arabic stop words file: a set of default Arabic stop words. |
||||
* </ul> |
||||
* |
||||
*/ |
||||
public final class ArabicAnalyzer extends StopwordAnalyzerBase { |
||||
|
||||
/** |
||||
* File containing default Arabic stopwords. |
||||
* |
||||
* Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
|
||||
* The stopword list is BSD-Licensed. |
||||
*/ |
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; |
||||
|
||||
/** |
||||
* Returns an unmodifiable instance of the default stop-words set. |
||||
* @return an unmodifiable instance of the default stop-words set. |
||||
*/ |
||||
public static CharArraySet getDefaultStopSet(){ |
||||
return DefaultSetHolder.DEFAULT_STOP_SET; |
||||
} |
||||
|
||||
/** |
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
|
||||
 * accesses the static final set the first time. |
||||
*/ |
||||
private static class DefaultSetHolder { |
||||
static final CharArraySet DEFAULT_STOP_SET; |
||||
|
||||
static { |
||||
try { |
||||
DEFAULT_STOP_SET = loadStopwordSet(false, ArabicAnalyzer.class, DEFAULT_STOPWORD_FILE, "#"); |
||||
} catch (IOException ex) { |
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set"); |
||||
} |
||||
} |
||||
} |
||||
|
||||
private final CharArraySet stemExclusionSet; |
||||
|
||||
/** |
||||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. |
||||
*/ |
||||
public ArabicAnalyzer(Version matchVersion) { |
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer with the given stop words |
||||
* |
||||
* @param matchVersion |
||||
* lucene compatibility version |
||||
* @param stopwords |
||||
* a stopword set |
||||
*/ |
||||
public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords){ |
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
||||
} |
||||
|
||||
/** |
||||
 * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is |
||||
* provided this analyzer will add a {@link KeywordMarkerFilter} before |
||||
* {@link ArabicStemFilter}. |
||||
* |
||||
* @param matchVersion |
||||
* lucene compatibility version |
||||
* @param stopwords |
||||
* a stopword set |
||||
* @param stemExclusionSet |
||||
* a set of terms not to be stemmed |
||||
*/ |
||||
public ArabicAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){ |
||||
super(matchVersion, stopwords); |
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
||||
matchVersion, stemExclusionSet)); |
||||
} |
||||
|
||||
/** |
||||
* Creates |
||||
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* used to tokenize all the text in the provided {@link Reader}. |
||||
* |
||||
* @return {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* built from an {@link StandardTokenizer} filtered with |
||||
* {@link LowerCaseFilter}, {@link StopFilter}, |
||||
* {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter} |
||||
* if a stem exclusion set is provided and {@link ArabicStemFilter}. |
||||
*/ |
||||
@Override |
||||
protected TokenStreamComponents createComponents(String fieldName, |
||||
Reader reader) { |
||||
final Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31) ? |
||||
new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader); |
||||
TokenStream result = new LowerCaseFilter(matchVersion, source); |
||||
// the order here is important: the stopword list is not normalized!
|
||||
result = new StopFilter(matchVersion, result, stopwords); |
||||
// TODO maybe we should make ArabicNormalization filter also KeywordAttribute aware?!
|
||||
result = new ArabicNormalizationFilter(result); |
||||
if(!stemExclusionSet.isEmpty()) { |
||||
result = new KeywordMarkerFilter(result, stemExclusionSet); |
||||
} |
||||
return new TokenStreamComponents(source, new ArabicStemFilter(result)); |
||||
} |
||||
} |
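A usage sketch for the analyzer above; Version.LUCENE_CURRENT stands in for whatever compatibility constant the application pins, and the exclusion term is illustrative:

// Terms in the exclusion set are keyword-marked and bypass ArabicStemFilter.
CharArraySet exclusions = new CharArraySet(Version.LUCENE_CURRENT,
    Arrays.asList("كتاب"), false);
Analyzer analyzer = new ArabicAnalyzer(Version.LUCENE_CURRENT,
    ArabicAnalyzer.getDefaultStopSet(), exclusions);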
||||
|
@ -0,0 +1,96 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.Reader; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.core.LetterTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link
|
||||
import com.fr.third.org.apache.lucene.util.AttributeSource; |
||||
import com.fr.third.org.apache.lucene.util.Version; |
||||
|
||||
/** |
||||
* Tokenizer that breaks text into runs of letters and diacritics. |
||||
* <p> |
||||
* The problem with the standard Letter tokenizer is that it fails on diacritics. |
||||
* Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc. |
||||
* </p> |
||||
* <p> |
||||
* <a name="version"/> |
||||
* You must specify the required {@link Version} compatibility when creating |
||||
* {@link ArabicLetterTokenizer}: |
||||
* <ul> |
||||
* <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and |
||||
* detect token characters. See {@link #isTokenChar(int)} and |
||||
* {@link #normalize(int)} for details.</li> |
||||
* </ul> |
||||
* @deprecated (3.1) Use {@link StandardTokenizer} instead. |
||||
*/ |
||||
@Deprecated |
||||
public class ArabicLetterTokenizer extends LetterTokenizer { |
||||
/** |
||||
* Construct a new ArabicLetterTokenizer. |
||||
* @param matchVersion Lucene version |
||||
 * to match; see <a href="#version">above</a> |
||||
* |
||||
* @param in |
||||
* the input to split up into tokens |
||||
*/ |
||||
public ArabicLetterTokenizer(Version matchVersion, Reader in) { |
||||
super(matchVersion, in); |
||||
} |
||||
|
||||
/** |
||||
* Construct a new ArabicLetterTokenizer using a given {@link AttributeSource}. |
||||
* |
||||
* @param matchVersion |
||||
 * Lucene version to match; see <a href="#version">above</a> |
||||
* @param source |
||||
* the attribute source to use for this Tokenizer |
||||
* @param in |
||||
* the input to split up into tokens |
||||
*/ |
||||
public ArabicLetterTokenizer(Version matchVersion, AttributeSource source, Reader in) { |
||||
super(matchVersion, source, in); |
||||
} |
||||
|
||||
/** |
||||
* Construct a new ArabicLetterTokenizer using a given |
||||
 * {@link AttributeSource.AttributeFactory}. |
||||
 * |
||||
 * @param matchVersion |
||||
 * Lucene version to match; see <a href="#version">above</a> |
||||
* |
||||
* @param factory |
||||
* the attribute factory to use for this Tokenizer |
||||
* @param in |
||||
* the input to split up into tokens |
||||
*/ |
||||
public ArabicLetterTokenizer(Version matchVersion, AttributeFactory factory, Reader in) { |
||||
super(matchVersion, factory, in); |
||||
} |
||||
|
||||
/** |
||||
* Allows for Letter category or NonspacingMark category |
||||
* @see com.fr.third.org.apache.lucene.analysis.core.LetterTokenizer#isTokenChar(int) |
||||
*/ |
||||
@Override |
||||
protected boolean isTokenChar(int c) { |
||||
return super.isTokenChar(c) || Character.getType(c) == Character.NON_SPACING_MARK; |
||||
} |
||||
|
||||
} |
@ -0,0 +1,43 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.ar.ArabicLetterTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenizerFactory; |
||||
|
||||
import java.io.Reader; |
||||
import java.util.Map; |
||||
|
||||
|
||||
/** |
||||
 * Factory for {@link ArabicLetterTokenizer}. |
||||
* @deprecated (3.1) Use StandardTokenizerFactory instead. |
||||
**/ |
||||
@Deprecated |
||||
public class ArabicLetterTokenizerFactory extends TokenizerFactory { |
||||
|
||||
@Override |
||||
public void init(Map<String,String> args) { |
||||
super.init(args); |
||||
assureMatchVersion(); |
||||
} |
||||
|
||||
public ArabicLetterTokenizer create(Reader input) { |
||||
return new ArabicLetterTokenizer(luceneMatchVersion, input); |
||||
} |
||||
} |
@ -0,0 +1,48 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
|
||||
/** |
||||
* A {@link TokenFilter} that applies {@link ArabicNormalizer} to normalize the orthography. |
||||
* |
||||
*/ |
||||
|
||||
public final class ArabicNormalizationFilter extends TokenFilter { |
||||
private final ArabicNormalizer normalizer = new ArabicNormalizer(); |
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
|
||||
public ArabicNormalizationFilter(TokenStream input) { |
||||
super(input); |
||||
} |
||||
|
||||
@Override |
||||
public boolean incrementToken() throws IOException { |
||||
if (input.incrementToken()) { |
||||
int newlen = normalizer.normalize(termAtt.buffer(), termAtt.length()); |
||||
termAtt.setLength(newlen); |
||||
return true; |
||||
} |
||||
return false; |
||||
} |
||||
} |
@ -0,0 +1,48 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.ar.ArabicNormalizationFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.util.AbstractAnalysisFactory; |
||||
import com.fr.third.org.apache.lucene.analysis.util.MultiTermAwareComponent; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory; |
||||
|
||||
|
||||
/** |
||||
* Factory for {@link ArabicNormalizationFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_arnormal" class="solr.TextField" positionIncrementGap="100"> |
||||
* <analyzer> |
||||
* <tokenizer class="solr.StandardTokenizerFactory"/> |
||||
* <filter class="solr.ArabicNormalizationFilterFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
* |
||||
*/ |
||||
public class ArabicNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { |
||||
|
||||
public ArabicNormalizationFilter create(TokenStream input) { |
||||
return new ArabicNormalizationFilter(input); |
||||
} |
||||
|
||||
@Override |
||||
public AbstractAnalysisFactory getMultiTermComponent() { |
||||
return this; |
||||
} |
||||
} |
@ -0,0 +1,101 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import static com.fr.third.org.apache.lucene.analysis.util.StemmerUtil.*; |
||||
|
||||
/** |
||||
* Normalizer for Arabic. |
||||
* <p> |
||||
* Normalization is done in-place for efficiency, operating on a termbuffer. |
||||
* <p> |
||||
* Normalization is defined as: |
||||
* <ul> |
||||
* <li> Normalization of hamza with alef seat to a bare alef. |
||||
 * <li> Normalization of teh marbuta to heh. |
||||
* <li> Normalization of dotless yeh (alef maksura) to yeh. |
||||
 * <li> Removal of Arabic diacritics (the harakat). |
||||
* <li> Removal of tatweel (stretching character). |
||||
* </ul> |
||||
* |
||||
*/ |
||||
public class ArabicNormalizer { |
||||
public static final char ALEF = '\u0627'; |
||||
public static final char ALEF_MADDA = '\u0622'; |
||||
public static final char ALEF_HAMZA_ABOVE = '\u0623'; |
||||
public static final char ALEF_HAMZA_BELOW = '\u0625'; |
||||
|
||||
public static final char YEH = '\u064A'; |
||||
public static final char DOTLESS_YEH = '\u0649'; |
||||
|
||||
public static final char TEH_MARBUTA = '\u0629'; |
||||
public static final char HEH = '\u0647'; |
||||
|
||||
public static final char TATWEEL = '\u0640'; |
||||
|
||||
public static final char FATHATAN = '\u064B'; |
||||
public static final char DAMMATAN = '\u064C'; |
||||
public static final char KASRATAN = '\u064D'; |
||||
public static final char FATHA = '\u064E'; |
||||
public static final char DAMMA = '\u064F'; |
||||
public static final char KASRA = '\u0650'; |
||||
public static final char SHADDA = '\u0651'; |
||||
public static final char SUKUN = '\u0652'; |
||||
|
||||
/** |
||||
* Normalize an input buffer of Arabic text |
||||
* |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
* @return length of input buffer after normalization |
||||
*/ |
||||
public int normalize(char s[], int len) { |
||||
|
||||
for (int i = 0; i < len; i++) { |
||||
switch (s[i]) { |
||||
case ALEF_MADDA: |
||||
case ALEF_HAMZA_ABOVE: |
||||
case ALEF_HAMZA_BELOW: |
||||
s[i] = ALEF; |
||||
break; |
||||
case DOTLESS_YEH: |
||||
s[i] = YEH; |
||||
break; |
||||
case TEH_MARBUTA: |
||||
s[i] = HEH; |
||||
break; |
||||
case TATWEEL: |
||||
case KASRATAN: |
||||
case DAMMATAN: |
||||
case FATHATAN: |
||||
case FATHA: |
||||
case DAMMA: |
||||
case KASRA: |
||||
case SHADDA: |
||||
case SUKUN: |
||||
len = delete(s, i, len); |
||||
i--; |
||||
break; |
||||
default: |
||||
break; |
||||
} |
||||
} |
||||
|
||||
return len; |
||||
} |
||||
} |
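A short sketch of the in-place contract described above: the buffer is rewritten and the new logical length returned, so callers must honor the returned length rather than the array length. The sample word is illustrative:

ArabicNormalizer normalizer = new ArabicNormalizer();
char[] buffer = "أَحْمَد".toCharArray();               // seated hamza plus harakat
int newLength = normalizer.normalize(buffer, buffer.length);
String normalized = new String(buffer, 0, newLength);  // "احمد"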
@ -0,0 +1,58 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // javadoc @link
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
|
||||
/** |
||||
 * A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words. |
||||
* <p> |
||||
* To prevent terms from being stemmed use an instance of |
||||
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets |
||||
* the {@link KeywordAttribute} before this {@link TokenStream}. |
||||
* </p> |
||||
 * @see KeywordMarkerFilter |
||||
 */ |
||||
|
||||
public final class ArabicStemFilter extends TokenFilter { |
||||
private final ArabicStemmer stemmer = new ArabicStemmer(); |
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); |
||||
|
||||
public ArabicStemFilter(TokenStream input) { |
||||
super(input); |
||||
} |
||||
|
||||
@Override |
||||
public boolean incrementToken() throws IOException { |
||||
if (input.incrementToken()) { |
||||
if(!keywordAttr.isKeyword()) { |
||||
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); |
||||
termAtt.setLength(newlen); |
||||
} |
||||
return true; |
||||
} else { |
||||
return false; |
||||
} |
||||
} |
||||
} |
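The KeywordAttribute interplay described above, shown as a hand-built chain; reader and protectedTerms (a CharArraySet) are assumed to exist in the caller's scope, and Version.LUCENE_CURRENT is illustrative:

TokenStream ts = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
ts = new LowerCaseFilter(Version.LUCENE_CURRENT, ts);
ts = new ArabicNormalizationFilter(ts);
ts = new KeywordMarkerFilter(ts, protectedTerms); // marks matching terms as keywords
ts = new ArabicStemFilter(ts);                    // keyword-marked terms pass through unstemmed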
@ -0,0 +1,43 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.ar.ArabicStemFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory; |
||||
|
||||
|
||||
/** |
||||
* Factory for {@link ArabicStemFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_arstem" class="solr.TextField" positionIncrementGap="100"> |
||||
* <analyzer> |
||||
* <tokenizer class="solr.StandardTokenizerFactory"/> |
||||
* <filter class="solr.ArabicNormalizationFilterFactory"/> |
||||
* <filter class="solr.ArabicStemFilterFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
* |
||||
*/ |
||||
public class ArabicStemFilterFactory extends TokenFilterFactory { |
||||
|
||||
|
||||
public ArabicStemFilter create(TokenStream input) { |
||||
return new ArabicStemFilter(input); |
||||
} |
||||
} |
@ -0,0 +1,150 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.ar; |
||||
|
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import static com.fr.third.org.apache.lucene.analysis.util.StemmerUtil.*; |
||||
|
||||
/** |
||||
* Stemmer for Arabic. |
||||
* <p> |
||||
* Stemming is done in-place for efficiency, operating on a termbuffer. |
||||
* <p> |
||||
* Stemming is defined as: |
||||
* <ul> |
||||
* <li> Removal of attached definite article, conjunction, and prepositions. |
||||
* <li> Stemming of common suffixes. |
||||
* </ul> |
||||
* |
||||
*/ |
||||
public class ArabicStemmer { |
||||
public static final char ALEF = '\u0627'; |
||||
public static final char BEH = '\u0628'; |
||||
public static final char TEH_MARBUTA = '\u0629'; |
||||
public static final char TEH = '\u062A'; |
||||
public static final char FEH = '\u0641'; |
||||
public static final char KAF = '\u0643'; |
||||
public static final char LAM = '\u0644'; |
||||
public static final char NOON = '\u0646'; |
||||
public static final char HEH = '\u0647'; |
||||
public static final char WAW = '\u0648'; |
||||
public static final char YEH = '\u064A'; |
||||
|
||||
public static final char prefixes[][] = { |
||||
("" + ALEF + LAM).toCharArray(), |
||||
("" + WAW + ALEF + LAM).toCharArray(), |
||||
("" + BEH + ALEF + LAM).toCharArray(), |
||||
("" + KAF + ALEF + LAM).toCharArray(), |
||||
("" + FEH + ALEF + LAM).toCharArray(), |
||||
("" + LAM + LAM).toCharArray(), |
||||
("" + WAW).toCharArray(), |
||||
}; |
||||
|
||||
public static final char suffixes[][] = { |
||||
("" + HEH + ALEF).toCharArray(), |
||||
("" + ALEF + NOON).toCharArray(), |
||||
("" + ALEF + TEH).toCharArray(), |
||||
("" + WAW + NOON).toCharArray(), |
||||
("" + YEH + NOON).toCharArray(), |
||||
("" + YEH + HEH).toCharArray(), |
||||
("" + YEH + TEH_MARBUTA).toCharArray(), |
||||
("" + HEH).toCharArray(), |
||||
("" + TEH_MARBUTA).toCharArray(), |
||||
("" + YEH).toCharArray(), |
||||
}; |
||||
|
||||
/** |
||||
* Stem an input buffer of Arabic text. |
||||
* |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
 * @return length of input buffer after stemming |
||||
*/ |
||||
public int stem(char s[], int len) { |
||||
len = stemPrefix(s, len); |
||||
len = stemSuffix(s, len); |
||||
|
||||
return len; |
||||
} |
||||
|
||||
/** |
||||
* Stem a prefix off an Arabic word. |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
* @return new length of input buffer after stemming. |
||||
*/ |
||||
public int stemPrefix(char s[], int len) { |
||||
for (int i = 0; i < prefixes.length; i++) |
||||
if (startsWithCheckLength(s, len, prefixes[i])) |
||||
return deleteN(s, 0, len, prefixes[i].length); |
||||
return len; |
||||
} |
||||
|
||||
/** |
||||
* Stem suffix(es) off an Arabic word. |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
* @return new length of input buffer after stemming |
||||
*/ |
||||
public int stemSuffix(char s[], int len) { |
||||
for (int i = 0; i < suffixes.length; i++) |
||||
if (endsWithCheckLength(s, len, suffixes[i])) |
||||
len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length); |
||||
return len; |
||||
} |
||||
|
||||
/** |
||||
* Returns true if the prefix matches and can be stemmed |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
* @param prefix prefix to check |
||||
* @return true if the prefix matches and can be stemmed |
||||
*/ |
||||
boolean startsWithCheckLength(char s[], int len, char prefix[]) { |
||||
if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters
|
||||
return false; |
||||
} else if (len < prefix.length + 2) { // other prefixes require only 2.
|
||||
return false; |
||||
} else { |
||||
for (int i = 0; i < prefix.length; i++) |
||||
if (s[i] != prefix[i]) |
||||
return false; |
||||
|
||||
return true; |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Returns true if the suffix matches and can be stemmed |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
* @param suffix suffix to check |
||||
* @return true if the suffix matches and can be stemmed |
||||
*/ |
||||
boolean endsWithCheckLength(char s[], int len, char suffix[]) { |
||||
if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming
|
||||
return false; |
||||
} else { |
||||
for (int i = 0; i < suffix.length; i++) |
||||
if (s[len - suffix.length + i] != suffix[i]) |
||||
return false; |
||||
|
||||
return true; |
||||
} |
||||
} |
||||
} |
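A quick sketch of the stemmer's buffer contract; the sample word is the definite form of "book" and is illustrative:

ArabicStemmer stemmer = new ArabicStemmer();
char[] buffer = "الكتاب".toCharArray();          // "ال" (definite article) + "كتاب"
int newLength = stemmer.stem(buffer, buffer.length);
String stem = new String(buffer, 0, newLength);  // "كتاب": the article is stripped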
@ -0,0 +1,22 @@
|
||||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
||||
<!-- |
||||
Licensed to the Apache Software Foundation (ASF) under one or more |
||||
contributor license agreements. See the NOTICE file distributed with |
||||
this work for additional information regarding copyright ownership. |
||||
The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
(the "License"); you may not use this file except in compliance with |
||||
the License. You may obtain a copy of the License at |
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
||||
Unless required by applicable law or agreed to in writing, software |
||||
distributed under the License is distributed on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
See the License for the specific language governing permissions and |
||||
limitations under the License. |
||||
--> |
||||
<html><head></head> |
||||
<body> |
||||
Analyzer for Arabic. |
||||
</body> |
||||
</html> |
@ -0,0 +1,131 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.bg; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
import java.util.Set; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.Analyzer; |
||||
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.core.StopFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.Tokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet; |
||||
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase; |
||||
import com.fr.third.org.apache.lucene.util.Version; |
||||
|
||||
/** |
||||
* {@link Analyzer} for Bulgarian. |
||||
* <p> |
||||
* This analyzer implements light-stemming as specified by: <i> Searching |
||||
* Strategies for the Bulgarian Language </i> |
||||
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
|
||||
* <p> |
||||
*/ |
||||
public final class BulgarianAnalyzer extends StopwordAnalyzerBase { |
||||
/** |
||||
* File containing default Bulgarian stopwords. |
||||
* |
||||
* Default stopword list is from |
||||
* http://members.unine.ch/jacques.savoy/clef/index.html The stopword list is
|
||||
* BSD-Licensed. |
||||
*/ |
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; |
||||
|
||||
/** |
||||
* Returns an unmodifiable instance of the default stop-words set. |
||||
* |
||||
* @return an unmodifiable instance of the default stop-words set. |
||||
*/ |
||||
public static CharArraySet getDefaultStopSet() { |
||||
return DefaultSetHolder.DEFAULT_STOP_SET; |
||||
} |
||||
|
||||
/** |
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer |
||||
 * class accesses the static final set the first time. |
||||
*/ |
||||
private static class DefaultSetHolder { |
||||
static final CharArraySet DEFAULT_STOP_SET; |
||||
|
||||
static { |
||||
try { |
||||
DEFAULT_STOP_SET = loadStopwordSet(false, BulgarianAnalyzer.class, DEFAULT_STOPWORD_FILE, "#"); |
||||
} catch (IOException ex) { |
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set"); |
||||
} |
||||
} |
||||
} |
||||
|
||||
private final CharArraySet stemExclusionSet; |
||||
|
||||
/** |
||||
* Builds an analyzer with the default stop words: |
||||
* {@link #DEFAULT_STOPWORD_FILE}. |
||||
*/ |
||||
public BulgarianAnalyzer(Version matchVersion) { |
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer with the given stop words. |
||||
*/ |
||||
public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords) { |
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET); |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer with the given stop words and a stem exclusion set. |
||||
* If a stem exclusion set is provided this analyzer will add a {@link KeywordMarkerFilter} |
||||
* before {@link BulgarianStemFilter}. |
||||
*/ |
||||
public BulgarianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { |
||||
super(matchVersion, stopwords); |
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( |
||||
matchVersion, stemExclusionSet)); |
||||
} |
||||
|
||||
/** |
||||
* Creates a |
||||
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* which tokenizes all the text in the provided {@link Reader}. |
||||
* |
||||
* @return A |
||||
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* built from an {@link StandardTokenizer} filtered with |
||||
 * {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}, |
||||
 * {@link KeywordMarkerFilter} if a stem exclusion set is |
||||
* provided and {@link BulgarianStemFilter}. |
||||
*/ |
||||
@Override |
||||
public TokenStreamComponents createComponents(String fieldName, Reader reader) { |
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
||||
TokenStream result = new StandardFilter(matchVersion, source); |
||||
result = new LowerCaseFilter(matchVersion, result); |
||||
result = new StopFilter(matchVersion, result, stopwords); |
||||
if(!stemExclusionSet.isEmpty()) |
||||
result = new KeywordMarkerFilter(result, stemExclusionSet); |
||||
result = new BulgarianStemFilter(result); |
||||
return new TokenStreamComponents(source, result); |
||||
} |
||||
} |
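Construction mirrors the other analyzers in this package; a sketch with an illustrative exclusion set (Version.LUCENE_CURRENT is a stand-in):

CharArraySet keepAsIs = new CharArraySet(Version.LUCENE_CURRENT,
    Arrays.asList("градът"), false);
Analyzer analyzer = new BulgarianAnalyzer(Version.LUCENE_CURRENT,
    BulgarianAnalyzer.getDefaultStopSet(), keepAsIs);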
@ -0,0 +1,58 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.bg; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
|
||||
/** |
||||
* A {@link TokenFilter} that applies {@link BulgarianStemmer} to stem Bulgarian |
||||
* words. |
||||
* <p> |
||||
* To prevent terms from being stemmed use an instance of |
||||
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets |
||||
* the {@link KeywordAttribute} before this {@link TokenStream}. |
||||
* </p> |
||||
*/ |
||||
public final class BulgarianStemFilter extends TokenFilter { |
||||
private final BulgarianStemmer stemmer = new BulgarianStemmer(); |
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); |
||||
|
||||
public BulgarianStemFilter(final TokenStream input) { |
||||
super(input); |
||||
} |
||||
|
||||
@Override |
||||
public boolean incrementToken() throws IOException { |
||||
if (input.incrementToken()) { |
||||
if(!keywordAttr.isKeyword()) { |
||||
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); |
||||
termAtt.setLength(newlen); |
||||
} |
||||
return true; |
||||
} else { |
||||
return false; |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,40 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.bg; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.bg.BulgarianStemFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory; |
||||
|
||||
/** |
||||
* Factory for {@link BulgarianStemFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_bgstem" class="solr.TextField" positionIncrementGap="100"> |
||||
* <analyzer> |
||||
* <tokenizer class="solr.StandardTokenizerFactory"/> |
||||
* <filter class="solr.LowerCaseFilterFactory"/> |
||||
* <filter class="solr.BulgarianStemFilterFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
* |
||||
*/ |
||||
public class BulgarianStemFilterFactory extends TokenFilterFactory { |
||||
public TokenStream create(TokenStream input) { |
||||
return new BulgarianStemFilter(input); |
||||
} |
||||
} |
@ -0,0 +1,143 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.bg; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import static com.fr.third.org.apache.lucene.analysis.util.StemmerUtil.*; |
||||
|
||||
/** |
||||
* Light Stemmer for Bulgarian. |
||||
* <p> |
||||
* Implements the algorithm described in: |
||||
* <i> |
||||
* Searching Strategies for the Bulgarian Language |
||||
* </i> |
||||
* http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
|
||||
*/ |
||||
public class BulgarianStemmer { |
||||
|
||||
/** |
||||
* Stem an input buffer of Bulgarian text. |
||||
* |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
 * @return length of input buffer after stemming |
||||
*/ |
||||
public int stem(final char s[], int len) { |
||||
if (len < 4) // do not stem
|
||||
return len; |
||||
|
||||
if (len > 5 && endsWith(s, len, "ища")) |
||||
return len - 3; |
||||
|
||||
len = removeArticle(s, len); |
||||
len = removePlural(s, len); |
||||
|
||||
if (len > 3) { |
||||
if (endsWith(s, len, "я")) |
||||
len--; |
||||
if (endsWith(s, len, "а") || |
||||
endsWith(s, len, "о") || |
||||
endsWith(s, len, "е")) |
||||
len--; |
||||
} |
||||
|
||||
// the rule to rewrite ен -> н is duplicated in the paper.
|
||||
// in the perl implementation referenced by the paper, this is fixed.
|
||||
// (it is fixed here as well)
|
||||
if (len > 4 && endsWith(s, len, "ен")) { |
||||
s[len - 2] = 'н'; // replace with н
|
||||
len--; |
||||
} |
||||
|
||||
if (len > 5 && s[len - 2] == 'ъ') { |
||||
s[len - 2] = s[len - 1]; // replace ъN with N
|
||||
len--; |
||||
} |
||||
|
||||
return len; |
||||
} |
||||
|
||||
/** |
||||
* Mainly remove the definite article |
||||
* @param s input buffer |
||||
* @param len length of input buffer |
||||
* @return new stemmed length |
||||
*/ |
||||
private int removeArticle(final char s[], final int len) { |
||||
if (len > 6 && endsWith(s, len, "ият")) |
||||
return len - 3; |
||||
|
||||
if (len > 5) { |
||||
if (endsWith(s, len, "ът") || |
||||
endsWith(s, len, "то") || |
||||
endsWith(s, len, "те") || |
||||
endsWith(s, len, "та") || |
||||
endsWith(s, len, "ия")) |
||||
return len - 2; |
||||
} |
||||
|
||||
if (len > 4 && endsWith(s, len, "ят")) |
||||
return len - 2; |
||||
|
||||
return len; |
||||
} |
||||
|
||||
private int removePlural(final char s[], final int len) { |
||||
if (len > 6) { |
||||
if (endsWith(s, len, "овци")) |
||||
return len - 3; // replace with о
|
||||
if (endsWith(s, len, "ове")) |
||||
return len - 3; |
||||
if (endsWith(s, len, "еве")) { |
||||
s[len - 3] = 'й'; // replace with й
|
||||
return len - 2; |
||||
} |
||||
} |
||||
|
||||
if (len > 5) { |
||||
if (endsWith(s, len, "ища")) |
||||
return len - 3; |
||||
if (endsWith(s, len, "та")) |
||||
return len - 2; |
||||
if (endsWith(s, len, "ци")) { |
||||
s[len - 2] = 'к'; // replace with к
|
||||
return len - 1; |
||||
} |
||||
if (endsWith(s, len, "зи")) { |
||||
s[len - 2] = 'г'; // replace with г
|
||||
return len - 1; |
||||
} |
||||
|
||||
if (s[len - 3] == 'е' && s[len - 1] == 'и') { |
||||
s[len - 3] = 'я'; // replace е with я, remove и
|
||||
return len - 1; |
||||
} |
||||
} |
||||
|
||||
if (len > 4) { |
||||
if (endsWith(s, len, "си")) { |
||||
s[len - 2] = 'х'; // replace with х
|
||||
return len - 1; |
||||
} |
||||
if (endsWith(s, len, "и")) |
||||
return len - 1; |
||||
} |
||||
|
||||
return len; |
||||
} |
||||
} |
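A sketch of the in-place contract; "градът" ("the city") loses its definite article:

BulgarianStemmer stemmer = new BulgarianStemmer();
char[] buffer = "градът".toCharArray();
int newLength = stemmer.stem(buffer, buffer.length);
String stem = new String(buffer, 0, newLength); // "град"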
@ -0,0 +1,22 @@
|
||||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
||||
<!-- |
||||
Licensed to the Apache Software Foundation (ASF) under one or more |
||||
contributor license agreements. See the NOTICE file distributed with |
||||
this work for additional information regarding copyright ownership. |
||||
The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
(the "License"); you may not use this file except in compliance with |
||||
the License. You may obtain a copy of the License at |
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
||||
Unless required by applicable law or agreed to in writing, software |
||||
distributed under the License is distributed on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
See the License for the specific language governing permissions and |
||||
limitations under the License. |
||||
--> |
||||
<html><head></head> |
||||
<body> |
||||
Analyzer for Bulgarian. |
||||
</body> |
||||
</html> |
@ -0,0 +1,138 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.br; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.Analyzer; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.Tokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.core.StopFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardAnalyzer; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet; |
||||
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase; |
||||
import com.fr.third.org.apache.lucene.analysis.util.WordlistLoader; |
||||
import com.fr.third.org.apache.lucene.util.IOUtils; |
||||
import com.fr.third.org.apache.lucene.util.Version; |
||||
|
||||
/** |
||||
* {@link Analyzer} for Brazilian Portuguese language. |
||||
* <p> |
||||
* Supports an external list of stopwords (words that |
||||
* will not be indexed at all) and an external list of exclusions (words that will |
||||
* not be stemmed, but indexed). |
||||
* </p> |
||||
* |
||||
* <p><b>NOTE</b>: This class uses the same {@link Version} |
||||
* dependent settings as {@link StandardAnalyzer}.</p> |
||||
*/ |
||||
public final class BrazilianAnalyzer extends StopwordAnalyzerBase { |
||||
/** File containing default Brazilian Portuguese stopwords. */ |
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; |
||||
|
||||
/** |
||||
* Returns an unmodifiable instance of the default stop-words set. |
||||
* @return an unmodifiable instance of the default stop-words set. |
||||
*/ |
||||
public static CharArraySet getDefaultStopSet(){ |
||||
return DefaultSetHolder.DEFAULT_STOP_SET; |
||||
} |
||||
|
||||
private static class DefaultSetHolder { |
||||
static final CharArraySet DEFAULT_STOP_SET; |
||||
|
||||
static { |
||||
try { |
||||
DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(BrazilianAnalyzer.class, |
||||
DEFAULT_STOPWORD_FILE, IOUtils.CHARSET_UTF_8), "#", Version.LUCENE_CURRENT); |
||||
} catch (IOException ex) { |
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set"); |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
/** |
||||
* Contains words that should be indexed but not stemmed. |
||||
*/ |
||||
private CharArraySet excltable = CharArraySet.EMPTY_SET; |
||||
|
||||
/** |
||||
* Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}). |
||||
*/ |
||||
public BrazilianAnalyzer(Version matchVersion) { |
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer with the given stop words |
||||
* |
||||
* @param matchVersion |
||||
* lucene compatibility version |
||||
* @param stopwords |
||||
* a stopword set |
||||
*/ |
||||
public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords) { |
||||
super(matchVersion, stopwords); |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer with the given stop words and stemming exclusion words |
||||
* |
||||
* @param matchVersion |
||||
* lucene compatibility version |
||||
* @param stopwords |
||||
* a stopword set |
||||
 * @param stemExclusionSet |
||||
 * a set of terms not to be stemmed |
||||
 */ |
||||
public BrazilianAnalyzer(Version matchVersion, CharArraySet stopwords, |
||||
CharArraySet stemExclusionSet) { |
||||
this(matchVersion, stopwords); |
||||
excltable = CharArraySet.unmodifiableSet(CharArraySet |
||||
.copy(matchVersion, stemExclusionSet)); |
||||
} |
||||
|
||||
/** |
||||
* Creates |
||||
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* used to tokenize all the text in the provided {@link Reader}. |
||||
* |
||||
* @return {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* built from a {@link StandardTokenizer} filtered with |
||||
 * {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, |
||||
 * and {@link BrazilianStemFilter}. |
||||
*/ |
||||
@Override |
||||
protected TokenStreamComponents createComponents(String fieldName, |
||||
Reader reader) { |
||||
Tokenizer source = new StandardTokenizer(matchVersion, reader); |
||||
TokenStream result = new LowerCaseFilter(matchVersion, source); |
||||
result = new StandardFilter(matchVersion, result); |
||||
result = new StopFilter(matchVersion, result, stopwords); |
||||
if(excltable != null && !excltable.isEmpty()) |
||||
result = new KeywordMarkerFilter(result, excltable); |
||||
return new TokenStreamComponents(source, new BrazilianStemFilter(result)); |
||||
} |
||||
} |
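A usage sketch; the excluded term and the Version constant are illustrative:

CharArraySet noStem = new CharArraySet(Version.LUCENE_CURRENT,
    Arrays.asList("brasileiro"), false);
Analyzer analyzer = new BrazilianAnalyzer(Version.LUCENE_CURRENT,
    BrazilianAnalyzer.getDefaultStopSet(), noStem);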
||||
|
@ -0,0 +1,76 @@

package com.fr.third.org.apache.lucene.analysis.br;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Set;

import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.KeywordAttribute;

/**
 * A {@link TokenFilter} that applies {@link BrazilianStemmer}.
 * <p>
 * To prevent terms from being stemmed use an instance of
 * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
 * the {@link KeywordAttribute} before this {@link TokenStream}.
 * </p>
 * @see KeywordMarkerFilter
 */
public final class BrazilianStemFilter extends TokenFilter {

  /**
   * {@link BrazilianStemmer} in use by this filter.
   */
  private BrazilianStemmer stemmer = new BrazilianStemmer();
  private Set<?> exclusions = null;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /**
   * Creates a new BrazilianStemFilter.
   *
   * @param in the source {@link TokenStream}
   */
  public BrazilianStemFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      final String term = termAtt.toString();
      // Check the exclusion table.
      if (!keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains(term))) {
        final String s = stemmer.stem(term);
        // If not stemmed, don't waste the time adjusting the token.
        if ((s != null) && !s.equals(term))
          termAtt.setEmpty().append(s);
      }
      return true;
    } else {
      return false;
    }
  }
}

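The class comment above points to {@link KeywordMarkerFilter} as the way to protect terms from stemming. A hedged sketch of that wiring, assuming the concrete KeywordMarkerFilter of this Lucene 4.x line; the term "brasil" and the whitespace tokenizer are illustrative choices:

import java.io.StringReader;

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.br.BrazilianStemFilter;
import com.fr.third.org.apache.lucene.analysis.core.WhitespaceTokenizer;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

public class ProtectedStemDemo {
  public static TokenStream chain(String text) {
    // Terms in this set get KeywordAttribute#isKeyword() == true, so the
    // exclusion check in incrementToken() above skips stemming for them.
    CharArraySet keywords = new CharArraySet(Version.LUCENE_40, 4, true);
    keywords.add("brasil");
    Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_40, new StringReader(text));
    return new BrazilianStemFilter(new KeywordMarkerFilter(source, keywords));
  }
}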
@ -0,0 +1,41 @@

package com.fr.third.org.apache.lucene.analysis.br;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.br.BrazilianStemFilter;
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link BrazilianStemFilter}.
 * <pre class="prettyprint">
 * <fieldType name="text_brstem" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.StandardTokenizerFactory"/>
 *     <filter class="solr.LowerCaseFilterFactory"/>
 *     <filter class="solr.BrazilianStemFilterFactory"/>
 *   </analyzer>
 * </fieldType></pre>
 */
public class BrazilianStemFilterFactory extends TokenFilterFactory {
  @Override
  public BrazilianStemFilter create(TokenStream in) {
    return new BrazilianStemFilter(in);
  }
}

File diff suppressed because it is too large
@ -0,0 +1,22 @@

<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Brazilian Portuguese.
</body>
</html>
@ -0,0 +1,148 @@

package com.fr.third.org.apache.lucene.analysis.ca;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;

import com.fr.third.org.apache.lucene.analysis.Analyzer;
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter;
import com.fr.third.org.apache.lucene.analysis.core.StopFilter;
import com.fr.third.org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.snowball.SnowballFilter;
import com.fr.third.org.apache.lucene.analysis.standard.StandardFilter;
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.analysis.util.ElisionFilter;
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import com.fr.third.org.apache.lucene.util.Version;
import com.fr.third.org.tartarus.snowball.ext.CatalanStemmer;

/**
 * {@link Analyzer} for Catalan.
 * <p>
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating CatalanAnalyzer:
 * <ul>
 *   <li> As of 3.6, ElisionFilter with a set of Catalan
 *        contractions is used by default.
 * </ul>
 */
public final class CatalanAnalyzer extends StopwordAnalyzerBase {
  private final CharArraySet stemExclusionSet;

  /** File containing default Catalan stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
      new CharArraySet(Version.LUCENE_CURRENT,
          Arrays.asList(
              "d", "l", "m", "n", "s", "t"
          ), true));

  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
   */
  public static CharArraySet getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
   * accesses the static final set the first time.
   */
  private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET;

    static {
      try {
        DEFAULT_STOP_SET = loadStopwordSet(false,
            CatalanAnalyzer.class, DEFAULT_STOPWORD_FILE, "#");
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new RuntimeException("Unable to load default stopword set", ex);
      }
    }
  }

  /**
   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
   */
  public CatalanAnalyzer(Version matchVersion) {
    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
   * Builds an analyzer with the given stop words.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   */
  public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
  }

  /**
   * Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
   * provided this analyzer will add a {@link KeywordMarkerFilter} before
   * stemming.
   *
   * @param matchVersion lucene compatibility version
   * @param stopwords a stopword set
   * @param stemExclusionSet a set of terms not to be stemmed
   */
  public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(matchVersion, stopwords);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
        matchVersion, stemExclusionSet));
  }

  /**
   * Creates a
   * {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   *
   * @return A
   *         {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter},
   *         {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is
   *         provided, and {@link SnowballFilter}.
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
    if (matchVersion.onOrAfter(Version.LUCENE_36)) {
      result = new ElisionFilter(result, DEFAULT_ARTICLES);
    }
    result = new LowerCaseFilter(matchVersion, result);
    result = new StopFilter(matchVersion, result, stopwords);
    if (!stemExclusionSet.isEmpty())
      result = new KeywordMarkerFilter(result, stemExclusionSet);
    result = new SnowballFilter(result, new CatalanStemmer());
    return new TokenStreamComponents(source, result);
  }
}
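A sketch of the Version-gated elision behavior documented above, under the same repackaged API; the field name and input are illustrative, and the exact stemmed output depends on the Catalan snowball stemmer:

import java.io.IOException;
import java.io.StringReader;

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.ca.CatalanAnalyzer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.util.Version;

public class CatalanElisionDemo {
  public static void main(String[] args) throws IOException {
    // With LUCENE_36 or later, the ElisionFilter branch in createComponents()
    // strips the contracted article, so "l'home" is reduced to "home" before
    // stemming; with an earlier match version the "l'" prefix is kept.
    CatalanAnalyzer analyzer = new CatalanAnalyzer(Version.LUCENE_40);
    TokenStream ts = analyzer.tokenStream("f", new StringReader("l'home"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}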
@ -0,0 +1,22 @@

<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Catalan.
</body>
</html>
@ -0,0 +1,110 @@

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fr.third.org.apache.lucene.analysis.charfilter;

import com.fr.third.org.apache.lucene.analysis.CharFilter;
import com.fr.third.org.apache.lucene.util.ArrayUtil;

import java.io.Reader;
import java.util.Arrays;

/**
 * Base utility class for implementing a {@link CharFilter}.
 * Subclasses record mappings by calling {@link #addOffCorrectMap}
 * and then invoke {@link #correct} to map an offset in the filtered
 * output back to the corresponding offset in the original input.
 */
public abstract class BaseCharFilter extends CharFilter {

  private int[] offsets;
  private int[] diffs;
  private int size = 0;

  public BaseCharFilter(Reader in) {
    super(in);
  }

  /** Retrieve the corrected offset. */
  @Override
  protected int correct(int currentOff) {
    if (offsets == null || currentOff < offsets[0]) {
      return currentOff;
    }

    int hi = size - 1;
    if (currentOff >= offsets[hi])
      return currentOff + diffs[hi];

    int lo = 0;
    int mid = -1;

    while (hi >= lo) {
      mid = (lo + hi) >>> 1;
      if (currentOff < offsets[mid])
        hi = mid - 1;
      else if (currentOff > offsets[mid])
        lo = mid + 1;
      else
        return currentOff + diffs[mid];
    }

    if (currentOff < offsets[mid])
      return mid == 0 ? currentOff : currentOff + diffs[mid - 1];
    else
      return currentOff + diffs[mid];
  }

  protected int getLastCumulativeDiff() {
    return offsets == null ? 0 : diffs[size - 1];
  }

  /**
   * <p>
   * Adds an offset correction mapping at the given output stream offset.
   * </p>
   * <p>
   * Assumption: the offset given with each successive call to this method
   * will not be smaller than the offset given at the previous invocation.
   * </p>
   *
   * @param off The output stream offset at which to apply the correction
   * @param cumulativeDiff The input offset is given by adding this
   *                       to the output offset
   */
  protected void addOffCorrectMap(int off, int cumulativeDiff) {
    if (offsets == null) {
      offsets = new int[64];
      diffs = new int[64];
    } else if (size == offsets.length) {
      offsets = ArrayUtil.grow(offsets);
      diffs = ArrayUtil.grow(diffs);
    }

    assert (size == 0 || off >= offsets[size - 1])
        : "Offset #" + size + "(" + off + ") is less than the last recorded offset "
          + offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);

    if (size == 0 || off != offsets[size - 1]) {
      offsets[size] = off;
      diffs[size++] = cumulativeDiff;
    } else { // Overwrite the diff at the last recorded offset
      diffs[size - 1] = cumulativeDiff;
    }
  }
}
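A toy illustration of the mapping contract above; DemoCharFilter and its constants are made up for the example, and correctOffset() is the public entry point inherited from CharFilter that delegates to correct():

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import com.fr.third.org.apache.lucene.analysis.charfilter.BaseCharFilter;

public class OffsetDemo {
  // Pretend 5 input characters were dropped before output offset 10.
  static final class DemoCharFilter extends BaseCharFilter {
    DemoCharFilter(Reader in) {
      super(in);
      addOffCorrectMap(10, 5); // from output offset 10 on: input = output + 5
    }
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
      return input.read(cbuf, off, len); // pass-through; we only exercise offsets
    }
  }

  public static void main(String[] args) throws IOException {
    DemoCharFilter f = new DemoCharFilter(new StringReader(""));
    System.out.println(f.correctOffset(9));  // 9  (before the first mapping)
    System.out.println(f.correctOffset(10)); // 15 (10 + 5)
    System.out.println(f.correctOffset(42)); // 47 (past the last mapping)
    f.close();
  }
}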
@ -0,0 +1,162 @@

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
                    | "Aring" | "Atilde" | "Auml" | "Beta" | "Ccedil" | "Chi"
                    | "Dagger" | "Delta" | "ETH" | "Eacute" | "Ecirc"
                    | "Egrave" | "Epsilon" | "Eta" | "Euml" | "Gamma"
                    | "Iacute" | "Icirc" | "Igrave" | "Iota" | "Iuml" | "Kappa"
                    | "Lambda" | "Mu" | "Ntilde" | "Nu" | "OElig" | "Oacute"
                    | "Ocirc" | "Ograve" | "Omega" | "Omicron" | "Oslash"
                    | "Otilde" | "Ouml" | "Phi" | "Pi" | "Prime" | "Psi"
                    | "Rho" | "Scaron" | "Sigma" | "THORN" | "Tau" | "Theta"
                    | "Uacute" | "Ucirc" | "Ugrave" | "Upsilon" | "Uuml" | "Xi"
                    | "Yacute" | "Yuml" | "Zeta" | "aacute" | "acirc" | "acute"
                    | "aelig" | "agrave" | "alefsym" | "alpha" | "amp" | "AMP"
                    | "and" | "ang" | "apos" | "aring" | "asymp" | "atilde"
                    | "auml" | "bdquo" | "beta" | "brvbar" | "bull" | "cap"
                    | "ccedil" | "cedil" | "cent" | "chi" | "circ" | "clubs"
                    | "cong" | "copy" | "COPY" | "crarr" | "cup" | "curren"
                    | "dArr" | "dagger" | "darr" | "deg" | "delta" | "diams"
                    | "divide" | "eacute" | "ecirc" | "egrave" | "empty"
                    | "emsp" | "ensp" | "epsilon" | "equiv" | "eta" | "eth"
                    | "euml" | "euro" | "exist" | "fnof" | "forall" | "frac12"
                    | "frac14" | "frac34" | "frasl" | "gamma" | "ge" | "gt"
                    | "GT" | "hArr" | "harr" | "hearts" | "hellip" | "iacute"
                    | "icirc" | "iexcl" | "igrave" | "image" | "infin" | "int"
                    | "iota" | "iquest" | "isin" | "iuml" | "kappa" | "lArr"
                    | "lambda" | "lang" | "laquo" | "larr" | "lceil" | "ldquo"
                    | "le" | "lfloor" | "lowast" | "loz" | "lrm" | "lsaquo"
                    | "lsquo" | "lt" | "LT" | "macr" | "mdash" | "micro"
                    | "middot" | "minus" | "mu" | "nabla" | "nbsp" | "ndash"
                    | "ne" | "ni" | "not" | "notin" | "nsub" | "ntilde" | "nu"
                    | "oacute" | "ocirc" | "oelig" | "ograve" | "oline"
                    | "omega" | "omicron" | "oplus" | "or" | "ordf" | "ordm"
                    | "oslash" | "otilde" | "otimes" | "ouml" | "para" | "part"
                    | "permil" | "perp" | "phi" | "pi" | "piv" | "plusmn"
                    | "pound" | "prime" | "prod" | "prop" | "psi" | "quot"
                    | "QUOT" | "rArr" | "radic" | "rang" | "raquo" | "rarr"
                    | "rceil" | "rdquo" | "real" | "reg" | "REG" | "rfloor"
                    | "rho" | "rlm" | "rsaquo" | "rsquo" | "sbquo" | "scaron"
                    | "sdot" | "sect" | "shy" | "sigma" | "sigmaf" | "sim"
                    | "spades" | "sub" | "sube" | "sum" | "sup" | "sup1"
                    | "sup2" | "sup3" | "supe" | "szlig" | "tau" | "there4"
                    | "theta" | "thetasym" | "thinsp" | "thorn" | "tilde"
                    | "times" | "trade" | "uArr" | "uacute" | "uarr" | "ucirc"
                    | "ugrave" | "uml" | "upsih" | "upsilon" | "uuml"
                    | "weierp" | "xi" | "yacute" | "yen" | "yuml" | "zeta"
                    | "zwj" | "zwnj" )
%{
  private static final Map<String,String> upperCaseVariantsAccepted
      = new HashMap<String,String>();
  static {
    upperCaseVariantsAccepted.put("quot", "QUOT");
    upperCaseVariantsAccepted.put("copy", "COPY");
    upperCaseVariantsAccepted.put("gt", "GT");
    upperCaseVariantsAccepted.put("lt", "LT");
    upperCaseVariantsAccepted.put("reg", "REG");
    upperCaseVariantsAccepted.put("amp", "AMP");
  }
  private static final CharArrayMap<Character> entityValues
      = new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
  static {
    String[] entities = {
      "AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
      "Agrave", "\u00C0", "Alpha", "\u0391", "Aring", "\u00C5",
      "Atilde", "\u00C3", "Auml", "\u00C4", "Beta", "\u0392",
      "Ccedil", "\u00C7", "Chi", "\u03A7", "Dagger", "\u2021",
      "Delta", "\u0394", "ETH", "\u00D0", "Eacute", "\u00C9",
      "Ecirc", "\u00CA", "Egrave", "\u00C8", "Epsilon", "\u0395",
      "Eta", "\u0397", "Euml", "\u00CB", "Gamma", "\u0393", "Iacute", "\u00CD",
      "Icirc", "\u00CE", "Igrave", "\u00CC", "Iota", "\u0399",
      "Iuml", "\u00CF", "Kappa", "\u039A", "Lambda", "\u039B", "Mu", "\u039C",
      "Ntilde", "\u00D1", "Nu", "\u039D", "OElig", "\u0152",
      "Oacute", "\u00D3", "Ocirc", "\u00D4", "Ograve", "\u00D2",
      "Omega", "\u03A9", "Omicron", "\u039F", "Oslash", "\u00D8",
      "Otilde", "\u00D5", "Ouml", "\u00D6", "Phi", "\u03A6", "Pi", "\u03A0",
      "Prime", "\u2033", "Psi", "\u03A8", "Rho", "\u03A1", "Scaron", "\u0160",
      "Sigma", "\u03A3", "THORN", "\u00DE", "Tau", "\u03A4", "Theta", "\u0398",
      "Uacute", "\u00DA", "Ucirc", "\u00DB", "Ugrave", "\u00D9",
      "Upsilon", "\u03A5", "Uuml", "\u00DC", "Xi", "\u039E",
      "Yacute", "\u00DD", "Yuml", "\u0178", "Zeta", "\u0396",
      "aacute", "\u00E1", "acirc", "\u00E2", "acute", "\u00B4",
      "aelig", "\u00E6", "agrave", "\u00E0", "alefsym", "\u2135",
      "alpha", "\u03B1", "amp", "\u0026", "and", "\u2227", "ang", "\u2220",
      "apos", "\u0027", "aring", "\u00E5", "asymp", "\u2248",
      "atilde", "\u00E3", "auml", "\u00E4", "bdquo", "\u201E",
      "beta", "\u03B2", "brvbar", "\u00A6", "bull", "\u2022", "cap", "\u2229",
      "ccedil", "\u00E7", "cedil", "\u00B8", "cent", "\u00A2", "chi", "\u03C7",
      "circ", "\u02C6", "clubs", "\u2663", "cong", "\u2245", "copy", "\u00A9",
      "crarr", "\u21B5", "cup", "\u222A", "curren", "\u00A4", "dArr", "\u21D3",
      "dagger", "\u2020", "darr", "\u2193", "deg", "\u00B0", "delta", "\u03B4",
      "diams", "\u2666", "divide", "\u00F7", "eacute", "\u00E9",
      "ecirc", "\u00EA", "egrave", "\u00E8", "empty", "\u2205",
      "emsp", "\u2003", "ensp", "\u2002", "epsilon", "\u03B5",
      "equiv", "\u2261", "eta", "\u03B7", "eth", "\u00F0", "euml", "\u00EB",
      "euro", "\u20AC", "exist", "\u2203", "fnof", "\u0192",
      "forall", "\u2200", "frac12", "\u00BD", "frac14", "\u00BC",
      "frac34", "\u00BE", "frasl", "\u2044", "gamma", "\u03B3", "ge", "\u2265",
      "gt", "\u003E", "hArr", "\u21D4", "harr", "\u2194", "hearts", "\u2665",
      "hellip", "\u2026", "iacute", "\u00ED", "icirc", "\u00EE",
      "iexcl", "\u00A1", "igrave", "\u00EC", "image", "\u2111",
      "infin", "\u221E", "int", "\u222B", "iota", "\u03B9", "iquest", "\u00BF",
      "isin", "\u2208", "iuml", "\u00EF", "kappa", "\u03BA", "lArr", "\u21D0",
      "lambda", "\u03BB", "lang", "\u2329", "laquo", "\u00AB",
      "larr", "\u2190", "lceil", "\u2308", "ldquo", "\u201C", "le", "\u2264",
      "lfloor", "\u230A", "lowast", "\u2217", "loz", "\u25CA", "lrm", "\u200E",
      "lsaquo", "\u2039", "lsquo", "\u2018", "lt", "\u003C", "macr", "\u00AF",
      "mdash", "\u2014", "micro", "\u00B5", "middot", "\u00B7",
      "minus", "\u2212", "mu", "\u03BC", "nabla", "\u2207", "nbsp", "\u00A0",
      "ndash", "\u2013", "ne", "\u2260", "ni", "\u220B", "not", "\u00AC",
      "notin", "\u2209", "nsub", "\u2284", "ntilde", "\u00F1", "nu", "\u03BD",
      "oacute", "\u00F3", "ocirc", "\u00F4", "oelig", "\u0153",
      "ograve", "\u00F2", "oline", "\u203E", "omega", "\u03C9",
      "omicron", "\u03BF", "oplus", "\u2295", "or", "\u2228", "ordf", "\u00AA",
      "ordm", "\u00BA", "oslash", "\u00F8", "otilde", "\u00F5",
      "otimes", "\u2297", "ouml", "\u00F6", "para", "\u00B6", "part", "\u2202",
      "permil", "\u2030", "perp", "\u22A5", "phi", "\u03C6", "pi", "\u03C0",
      "piv", "\u03D6", "plusmn", "\u00B1", "pound", "\u00A3",
      "prime", "\u2032", "prod", "\u220F", "prop", "\u221D", "psi", "\u03C8",
      "quot", "\"", "rArr", "\u21D2", "radic", "\u221A", "rang", "\u232A",
      "raquo", "\u00BB", "rarr", "\u2192", "rceil", "\u2309",
      "rdquo", "\u201D", "real", "\u211C", "reg", "\u00AE", "rfloor", "\u230B",
      "rho", "\u03C1", "rlm", "\u200F", "rsaquo", "\u203A", "rsquo", "\u2019",
      "sbquo", "\u201A", "scaron", "\u0161", "sdot", "\u22C5",
      "sect", "\u00A7", "shy", "\u00AD", "sigma", "\u03C3", "sigmaf", "\u03C2",
      "sim", "\u223C", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286",
      "sum", "\u2211", "sup", "\u2283", "sup1", "\u00B9", "sup2", "\u00B2",
      "sup3", "\u00B3", "supe", "\u2287", "szlig", "\u00DF", "tau", "\u03C4",
      "there4", "\u2234", "theta", "\u03B8", "thetasym", "\u03D1",
      "thinsp", "\u2009", "thorn", "\u00FE", "tilde", "\u02DC",
      "times", "\u00D7", "trade", "\u2122", "uArr", "\u21D1",
      "uacute", "\u00FA", "uarr", "\u2191", "ucirc", "\u00FB",
      "ugrave", "\u00F9", "uml", "\u00A8", "upsih", "\u03D2",
      "upsilon", "\u03C5", "uuml", "\u00FC", "weierp", "\u2118",
      "xi", "\u03BE", "yacute", "\u00FD", "yen", "\u00A5", "yuml", "\u00FF",
      "zeta", "\u03B6", "zwj", "\u200D", "zwnj", "\u200C"
    };
    for (int i = 0 ; i < entities.length ; i += 2) {
      Character value = entities[i + 1].charAt(0);
      entityValues.put(entities[i], value);
      String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);
      if (upperCaseVariant != null) {
        entityValues.put(upperCaseVariant, value);
      }
    }
  }
%}
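The point of keying the table by CharArrayMap rather than a HashMap<String,Character> is that the generated scanner can probe it directly against its lexer buffer (see the `entityValues.get(zzBuffer, zzStartRead, length)` call in HTMLStripCharFilter.jflex below) without allocating a String per candidate entity. A standalone sketch of that call shape, assuming the repackaged CharArrayMap API:

import com.fr.third.org.apache.lucene.analysis.util.CharArrayMap;
import com.fr.third.org.apache.lucene.util.Version;

public class EntityLookupDemo {
  public static void main(String[] args) {
    CharArrayMap<Character> map = new CharArrayMap<Character>(Version.LUCENE_40, 4, false);
    map.put("amp", Character.valueOf('\u0026'));
    char[] buffer = "&amp;".toCharArray();
    // Probe straight from the buffer slice covering "amp": no String is built.
    Character value = map.get(buffer, 1, 3);
    System.out.println(value); // &
  }
}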
@ -0,0 +1,64 @@

/*
 * Copyright 2010 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Generated using ICU4J 49.1.0.0 on Sunday, July 15, 2012 5:42:00 AM UTC
// by com.fr.third.org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros


ID_Start_Supp = (
      [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
    | [\uD81A][\uDC00-\uDE38]
    | [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
    | [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
    | [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
    | [\uD82C][\uDC00\uDC01]
    | [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
    | [\uD81B][\uDF00-\uDF44\uDF50\uDF93-\uDF9F]
    | [\uD87E][\uDC00-\uDE1D]
    | [\uD804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4]
    | [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
    | [\uD809][\uDC00-\uDC62]
    | [\uD808][\uDC00-\uDF6E]
    | [\uD803][\uDC00-\uDC48]
    | [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
    | [\uD80D][\uDC00-\uDC2E]
    | [\uD805][\uDE80-\uDEAA]
    | [\uD86E][\uDC00-\uDC1D]
    | [\uD801][\uDC00-\uDC9D]
)
ID_Continue_Supp = (
      [\uD81A][\uDC00-\uDE38]
    | [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
    | [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
    | [\uD82C][\uDC00\uDC01]
    | [\uD81B][\uDF00-\uDF44\uDF50-\uDF7E\uDF8F-\uDF9F]
    | [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
    | [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
    | [\uD87E][\uDC00-\uDE1D]
    | [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
    | [\uD805][\uDE80-\uDEB7\uDEC0-\uDEC9]
    | [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
    | [\uD809][\uDC00-\uDC62]
    | [\uD808][\uDC00-\uDF6E]
    | [\uD803][\uDC00-\uDC48]
    | [\uD80D][\uDC00-\uDC2E]
    | [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
    | [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA\uDCD0-\uDCE8\uDCF0-\uDCF9\uDD00-\uDD34\uDD36-\uDD3F\uDD80-\uDDC4\uDDD0-\uDDD9]
    | [\uD86E][\uDC00-\uDC1D]
    | [\uDB40][\uDD00-\uDDEF]
    | [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
    | [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
)
File diff suppressed because it is too large
@ -0,0 +1,919 @@

package com.fr.third.org.apache.lucene.analysis.charfilter;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import com.fr.third.org.apache.lucene.util.Version;
import com.fr.third.org.apache.lucene.analysis.util.CharArrayMap;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.analysis.util.OpenStringBuilder;


/**
 * A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
 */
@SuppressWarnings("fallthrough")
%%

%unicode 6.1
%apiprivate
%type int
%final
%public
%char
%function nextChar
%class HTMLStripCharFilter
%extends BaseCharFilter
%xstate AMPERSAND, NUMERIC_CHARACTER, CHARACTER_REFERENCE_TAIL
%xstate LEFT_ANGLE_BRACKET, BANG, COMMENT, SCRIPT, SCRIPT_COMMENT
%xstate LEFT_ANGLE_BRACKET_SLASH, LEFT_ANGLE_BRACKET_SPACE, CDATA
%xstate SERVER_SIDE_INCLUDE, SINGLE_QUOTED_STRING, DOUBLE_QUOTED_STRING
%xstate END_TAG_TAIL_INCLUDE, END_TAG_TAIL_EXCLUDE, END_TAG_TAIL_SUBSTITUTE
%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
%xstate STYLE, STYLE_COMMENT

// From XML 1.0 <http://www.w3.org/TR/xml/>:
//
// [4]  NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
// [4a] NameChar      ::= NameStartChar | "-" | "." | [0-9] | [...]
// [5]  Name          ::= NameStartChar (NameChar)*
//
// From UAX #31: Unicode Identifier and Pattern Syntax
// <http://unicode.org/reports/tr31/>:
//
// D1. Default Identifier Syntax
//
//    <identifier> := <ID_Start> <ID_Continue>*
//
Name = ( ( [:_\p{ID_Start}] | {ID_Start_Supp} ) ( [-.:_\p{ID_Continue}] | {ID_Continue_Supp} )* )

// From Apache httpd mod_include documentation
// <http://httpd.apache.org/docs/current/mod/mod_include.html>:
//
// Basic Elements
//
// The document is parsed as an HTML document, with special commands
// embedded as SGML comments. A command has the syntax:
//
//    <!--#element attribute=value attribute=value ... -->
//
// The value will often be enclosed in double quotes, but single quotes (')
// and backticks (`) are also possible. Many commands only allow a single
// attribute-value pair. Note that the comment terminator (-->) should be
// preceded by whitespace to ensure that it isn't considered part of an SSI
// token. Note that the leading <!--# is one token and may not contain any
// whitespace.
//

EventAttributeSuffixes = ( [aA][bB][oO][rR][tT] |
                           [bB][lL][uU][rR] |
                           [cC][hH][aA][nN][gG][eE] |
                           [cC][lL][iI][cC][kK] |
                           [dD][bB][lL][cC][lL][iI][cC][kK] |
                           [eE][rR][rR][oO][rR] |
                           [fF][oO][cC][uU][sS] |
                           [kK][eE][yY][dD][oO][wW][nN] |
                           [kK][eE][yY][pP][rR][eE][sS][sS] |
                           [kK][eE][yY][uU][pP] |
                           [lL][oO][aA][dD] |
                           [mM][oO][uU][sS][eE][dD][oO][wW][nN] |
                           [mM][oO][uU][sS][eE][mM][oO][vV][eE] |
                           [mM][oO][uU][sS][eE][oO][uU][tT] |
                           [mM][oO][uU][sS][eE][oO][vV][eE][rR] |
                           [mM][oO][uU][sS][eE][uU][pP] |
                           [rR][eE][sS][eE][tT] |
                           [sS][eE][lL][eE][cC][tT] |
                           [sS][uU][bB][mM][iI][tT] |
                           [uU][nN][lL][oO][aA][dD] )

SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )*

InlineElement = ( [aAbBiIqQsSuU] |
                  [aA][bB][bB][rR] |
                  [aA][cC][rR][oO][nN][yY][mM] |
                  [bB][aA][sS][eE][fF][oO][nN][tT] |
                  [bB][dD][oO] |
                  [bB][iI][gG] |
                  [cC][iI][tT][eE] |
                  [cC][oO][dD][eE] |
                  [dD][fF][nN] |
                  [eE][mM] |
                  [fF][oO][nN][tT] |
                  [iI][mM][gG] |
                  [iI][nN][pP][uU][tT] |
                  [kK][bB][dD] |
                  [lL][aA][bB][eE][lL] |
                  [sS][aA][mM][pP] |
                  [sS][eE][lL][eE][cC][tT] |
                  [sS][mM][aA][lL][lL] |
                  [sS][pP][aA][nN] |
                  [sS][tT][rR][iI][kK][eE] |
                  [sS][tT][rR][oO][nN][gG] |
                  [sS][uU][bB] |
                  [sS][uU][pP] |
                  [tT][eE][xX][tT][aA][rR][eE][aA] |
                  [tT][tT] |
                  [vV][aA][rR] )


%include HTMLCharacterEntities.jflex

%include HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro

%{
  private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
  private static final char BLOCK_LEVEL_START_TAG_REPLACEMENT = '\n';
  private static final char BLOCK_LEVEL_END_TAG_REPLACEMENT = '\n';
  private static final char BR_START_TAG_REPLACEMENT = '\n';
  private static final char BR_END_TAG_REPLACEMENT = '\n';
  private static final char SCRIPT_REPLACEMENT = '\n';
  private static final char STYLE_REPLACEMENT = '\n';
  private static final char REPLACEMENT_CHARACTER = '\uFFFD';

  private CharArraySet escapedTags = null;
  private int inputStart;
  private int cumulativeDiff;
  private boolean escapeBR = false;
  private boolean escapeSCRIPT = false;
  private boolean escapeSTYLE = false;
  private int restoreState;
  private int previousRestoreState;
  private int outputCharCount;
  private int eofReturnValue;
  private TextSegment inputSegment
      = new TextSegment(INITIAL_INPUT_SEGMENT_SIZE);
  private TextSegment outputSegment = inputSegment;
  private TextSegment entitySegment = new TextSegment(2);

  /**
   * Creates a new HTMLStripCharFilter over the provided Reader.
   * @param source Reader to strip html tags from.
   */
  public HTMLStripCharFilter(Reader source) {
    super(source);
    this.zzReader = source;
  }

  /**
   * Creates a new HTMLStripCharFilter over the provided Reader
   * with the specified set of escaped tags.
   * @param source Reader to strip html tags from.
   * @param escapedTags Tags in this set (both start and end tags)
   *                    will not be filtered out.
   */
  public HTMLStripCharFilter(Reader source, Set<String> escapedTags) {
    super(source);
    this.zzReader = source;
    if (null != escapedTags) {
      for (String tag : escapedTags) {
        if (tag.equalsIgnoreCase("BR")) {
          escapeBR = true;
        } else if (tag.equalsIgnoreCase("SCRIPT")) {
          escapeSCRIPT = true;
        } else if (tag.equalsIgnoreCase("STYLE")) {
          escapeSTYLE = true;
        } else {
          if (null == this.escapedTags) {
            this.escapedTags = new CharArraySet(Version.LUCENE_40, 16, true);
          }
          this.escapedTags.add(tag);
        }
      }
    }
  }

  @Override
  public int read() throws IOException {
    if (outputSegment.isRead()) {
      if (zzAtEOF) {
        return -1;
      }
      int ch = nextChar();
      ++outputCharCount;
      return ch;
    }
    int ch = outputSegment.nextChar();
    ++outputCharCount;
    return ch;
  }

  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    int i = 0;
    for ( ; i < len ; ++i) {
      int ch = read();
      if (ch == -1) break;
      cbuf[off++] = (char)ch;
    }
    return i > 0 ? i : (len == 0 ? 0 : -1);
  }

  @Override
  public void close() throws IOException {
    yyclose();
  }

  static int getInitialBufferSize() { // Package private, for testing purposes
    return ZZ_BUFFERSIZE;
  }

  private class TextSegment extends OpenStringBuilder {
    /** The position from which the next char will be read. */
    int pos = 0;

    /** Wraps the given buffer and sets this.len to the given length. */
    TextSegment(char[] buffer, int length) {
      super(buffer, length);
    }

    /** Allocates an internal buffer of the given size. */
    TextSegment(int size) {
      super(size);
    }

    /** Sets len = 0 and pos = 0. */
    void clear() {
      reset();
      restart();
    }

    /** Sets pos = 0. */
    void restart() {
      pos = 0;
    }

    /** Returns the next char in the segment. */
    int nextChar() {
      assert (!isRead()) : "Attempting to read past the end of a segment.";
      return buf[pos++];
    }

    /** Returns true when all characters in the text segment have been read. */
    boolean isRead() {
      return pos >= len;
    }
  }
%}

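// Usage sketch (illustrative only, not part of the generated scanner): the
// filter is a Reader decorator, so stripped text is obtained just by reading
// from it. Assuming the repackaged Lucene 4.x API:
//
//   java.io.Reader stripped = new HTMLStripCharFilter(
//       new java.io.StringReader("<div>caf&eacute; rocks</div>"),
//       java.util.Collections.singleton("b"));
//   // Reading yields roughly "\ncaf\u00E9 rocks\n": the div tags are replaced
//   // by newlines, the entity is decoded, and correctOffset() (inherited from
//   // BaseCharFilter) maps output offsets back into the original markup.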
%eofval{
  return eofReturnValue;
%eofval}
%eof{
  switch (zzLexicalState) {
    case SCRIPT:
    case COMMENT:
    case SCRIPT_COMMENT:
    case STYLE:
    case STYLE_COMMENT:
    case SINGLE_QUOTED_STRING:
    case DOUBLE_QUOTED_STRING:
    case END_TAG_TAIL_EXCLUDE:
    case END_TAG_TAIL_SUBSTITUTE:
    case START_TAG_TAIL_EXCLUDE:
    case SERVER_SIDE_INCLUDE:
    case START_TAG_TAIL_SUBSTITUTE: { // Exclude
      // add (length of input that won't be output) [ - (substitution length) = 0 ]
      cumulativeDiff += yychar - inputStart;
      // position the correction at (already output length) [ + (substitution length) = 0 ]
      addOffCorrectMap(outputCharCount, cumulativeDiff);
      outputSegment.clear();
      eofReturnValue = -1;
      break;
    }
    case CHARACTER_REFERENCE_TAIL: { // Substitute
      // At end of file, allow char refs without semicolons
      // add (length of input that won't be output) - (substitution length)
      cumulativeDiff += inputSegment.length() - outputSegment.length();
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
      eofReturnValue = outputSegment.nextChar();
      break;
    }
    case BANG:
    case CDATA:
    case AMPERSAND:
    case NUMERIC_CHARACTER:
    case END_TAG_TAIL_INCLUDE:
    case START_TAG_TAIL_INCLUDE:
    case LEFT_ANGLE_BRACKET:
    case LEFT_ANGLE_BRACKET_SLASH:
    case LEFT_ANGLE_BRACKET_SPACE: { // Include
      outputSegment = inputSegment;
      eofReturnValue = outputSegment.nextChar();
      break;
    }
    default: {
      eofReturnValue = -1;
    }
  }
%eof}

%%

"&" {
  inputStart = yychar;
  inputSegment.clear();
  inputSegment.append('&');
  yybegin(AMPERSAND);
}

"<" {
  inputStart = yychar;
  inputSegment.clear();
  inputSegment.append('<');
  yybegin(LEFT_ANGLE_BRACKET);
}

<AMPERSAND> {
  {CharacterEntities} {
    int length = yylength();
    inputSegment.write(zzBuffer, zzStartRead, length);
    entitySegment.clear();
    char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
    entitySegment.append(ch);
    outputSegment = entitySegment;
    yybegin(CHARACTER_REFERENCE_TAIL);
  }
  "#" { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER); }

  //                                      1 1     11      11
  // 0 1  2 3       45           678 9 0  1 2     34      5
  "#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
    // Handle paired UTF-16 surrogates.
    outputSegment = entitySegment;
    outputSegment.clear();
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    try {
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(2, 6) + "'";
    }
    try {
      outputSegment.unsafeWrite
          ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing low surrogate '"
                  + surrogatePair.substring(10, 14) + "'";
    }
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 2;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return highSurrogate;
  }

  //                            1 1     11      11
  // 01   2 345   678 9 0  1 2  3 4     5
  "#5" [56] \d{3} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
    // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    try { // High surrogates are in decimal range [55296, 56319]
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(1, 6) + "'";
    }
    if (Character.isHighSurrogate(highSurrogate)) {
      outputSegment = entitySegment;
      outputSegment.clear();
      try {
        outputSegment.unsafeWrite
            ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
      } catch(Exception e) { // should never happen
        assert false: "Exception parsing low surrogate '"
                    + surrogatePair.substring(10, 14) + "'";
      }
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 2;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
      return highSurrogate;
    }
    yypushback(surrogatePair.length() - 1); // Consume only '#'
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
  }

  //                                      1 111     11
  // 0 1  2 3       45           678 9012 3 4
  "#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#" [67] \d{3} ";" {
    // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    char lowSurrogate = '\u0000';
    try {
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(2, 6) + "'";
    }
    try { // Low surrogates are in decimal range [56320, 57343]
      lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing low surrogate '"
                  + surrogatePair.substring(9, 14) + "'";
    }
    if (Character.isLowSurrogate(lowSurrogate)) {
      outputSegment = entitySegment;
      outputSegment.clear();
      outputSegment.unsafeWrite(lowSurrogate);
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 2;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
      inputSegment.clear();
      yybegin(YYINITIAL);
      return highSurrogate;
    }
    yypushback(surrogatePair.length() - 1); // Consume only '#'
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
  }

  //                            1 111     11
  // 01   2 345   6789012 3     4
  "#5" [56] \d{3} ";&#" [67] \d{3} ";" {
    // Handle paired UTF-16 surrogates.
    String surrogatePair = yytext();
    char highSurrogate = '\u0000';
    try { // High surrogates are in decimal range [55296, 56319]
      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
    } catch(Exception e) { // should never happen
      assert false: "Exception parsing high surrogate '"
                  + surrogatePair.substring(1, 6) + "'";
    }
    if (Character.isHighSurrogate(highSurrogate)) {
      char lowSurrogate = '\u0000';
      try { // Low surrogates are in decimal range [56320, 57343]
        lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
      } catch(Exception e) { // should never happen
        assert false: "Exception parsing low surrogate '"
                    + surrogatePair.substring(9, 14) + "'";
      }
      if (Character.isLowSurrogate(lowSurrogate)) {
        outputSegment = entitySegment;
        outputSegment.clear();
        outputSegment.unsafeWrite(lowSurrogate);
        // add (previously matched input length) + (this match length) - (substitution length)
        cumulativeDiff += inputSegment.length() + yylength() - 2;
        // position the correction at (already output length) + (substitution length)
        addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
        inputSegment.clear();
        yybegin(YYINITIAL);
        return highSurrogate;
      }
    }
    yypushback(surrogatePair.length() - 1); // Consume only '#'
    inputSegment.append('#');
    yybegin(NUMERIC_CHARACTER);
  }
}

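// Worked example for the paired-surrogate rules above: "&#55357;&#56832;" is
// the decimal form of the UTF-16 pair 0xD83D 0xDE00 (U+1F600). The dec-dec
// rule parses the high surrogate from substring(1, 6) and the low one from
// substring(9, 14), returns the high surrogate immediately, and leaves the
// low surrogate in entitySegment, so the code point reaches the caller intact
// across two successive read() calls.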
<NUMERIC_CHARACTER> {
  [xX] [0-9A-Fa-f]+ {
    int matchLength = yylength();
    inputSegment.write(zzBuffer, zzStartRead, matchLength);
    if (matchLength <= 6) { // 10FFFF: max 6 hex chars
      String hexCharRef
          = new String(zzBuffer, zzStartRead + 1, matchLength - 1);
      int codePoint = 0;
      try {
        codePoint = Integer.parseInt(hexCharRef, 16);
      } catch(Exception e) {
        assert false: "Exception parsing hex code point '" + hexCharRef + "'";
      }
      if (codePoint <= 0x10FFFF) {
        outputSegment = entitySegment;
        outputSegment.clear();
        if (codePoint >= Character.MIN_SURROGATE
            && codePoint <= Character.MAX_SURROGATE) {
          outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
        } else {
          outputSegment.setLength
              (Character.toChars(codePoint, outputSegment.getArray(), 0));
        }
        yybegin(CHARACTER_REFERENCE_TAIL);
      } else {
        outputSegment = inputSegment;
        yybegin(YYINITIAL);
        return outputSegment.nextChar();
      }
    } else {
      outputSegment = inputSegment;
      yybegin(YYINITIAL);
      return outputSegment.nextChar();
    }
  }
  [0-9]+ {
    int matchLength = yylength();
    inputSegment.write(zzBuffer, zzStartRead, matchLength);
    if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
      String decimalCharRef = yytext();
      int codePoint = 0;
      try {
        codePoint = Integer.parseInt(decimalCharRef);
      } catch(Exception e) {
        assert false: "Exception parsing code point '" + decimalCharRef + "'";
      }
      if (codePoint <= 0x10FFFF) {
        outputSegment = entitySegment;
        outputSegment.clear();
        if (codePoint >= Character.MIN_SURROGATE
            && codePoint <= Character.MAX_SURROGATE) {
          outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
        } else {
          outputSegment.setLength
              (Character.toChars(codePoint, outputSegment.getArray(), 0));
        }
        yybegin(CHARACTER_REFERENCE_TAIL);
      } else {
        outputSegment = inputSegment;
        yybegin(YYINITIAL);
        return outputSegment.nextChar();
      }
    } else {
      outputSegment = inputSegment;
      yybegin(YYINITIAL);
      return outputSegment.nextChar();
    }
  }
}

<CHARACTER_REFERENCE_TAIL> {
  ";" {
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
  }
}

<LEFT_ANGLE_BRACKET_SLASH> {
  \s+ { inputSegment.write(zzBuffer, zzStartRead, yylength()); }
  [bB][rR] \s* ">" {
    yybegin(YYINITIAL);
    if (escapeBR) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 1;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_END_TAG_REPLACEMENT;
    }
  }
  {InlineElement} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_EXCLUDE);
    }
  }
  {Name} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(END_TAG_TAIL_INCLUDE);
    } else {
      yybegin(END_TAG_TAIL_SUBSTITUTE);
    }
  }
}

<END_TAG_TAIL_INCLUDE> {
  \s* ">" {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    outputSegment = inputSegment;
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
  }
}

<END_TAG_TAIL_EXCLUDE> {
  \s* ">" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
  }
}

<END_TAG_TAIL_SUBSTITUTE> {
  \s* ">" {
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
  }
}

<LEFT_ANGLE_BRACKET> {
  "!" { inputSegment.append('!'); yybegin(BANG); }
  "/" { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH); }
  \s+ {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    yybegin(LEFT_ANGLE_BRACKET_SPACE);
  }
  "?" [^>]* [/?] ">" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
  }
  \s* [bB][rR] ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    yybegin(YYINITIAL);
    if (escapeBR) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      return outputSegment.nextChar();
    } else {
      // add (previously matched input length) + (this match length) - (substitution length)
      cumulativeDiff += inputSegment.length() + yylength() - 1;
      // position the correction at (already output length) + (substitution length)
      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
      inputSegment.reset();
      return BR_START_TAG_REPLACEMENT;
    }
  }
  \s* [sS][cC][rR][iI][pP][tT] ( \s+ {OpenTagContent} )? \s* ">" {
    yybegin(SCRIPT);
    if (escapeSCRIPT) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      inputStart += 1 + yylength();
      return outputSegment.nextChar();
    }
  }
  \s* [sS][tT][yY][lL][eE] ( \s+ {OpenTagContent} )? \s* ">" {
    yybegin(STYLE);
    if (escapeSTYLE) {
      inputSegment.write(zzBuffer, zzStartRead, yylength());
      outputSegment = inputSegment;
      inputStart += 1 + yylength();
      return outputSegment.nextChar();
    }
  }
}

<LEFT_ANGLE_BRACKET, LEFT_ANGLE_BRACKET_SPACE> {
  {InlineElement} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(START_TAG_TAIL_INCLUDE);
    } else {
      yybegin(START_TAG_TAIL_EXCLUDE);
    }
  }
  {Name} {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    if (null != escapedTags
        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
      yybegin(START_TAG_TAIL_INCLUDE);
    } else {
      yybegin(START_TAG_TAIL_SUBSTITUTE);
    }
  }
}

<START_TAG_TAIL_INCLUDE> {
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    inputSegment.write(zzBuffer, zzStartRead, yylength());
    outputSegment = inputSegment;
    yybegin(YYINITIAL);
    return outputSegment.nextChar();
  }
}

<START_TAG_TAIL_EXCLUDE> {
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    outputSegment = inputSegment;
    yybegin(YYINITIAL);
  }
}

<START_TAG_TAIL_SUBSTITUTE> {
  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
    // add (previously matched input length) + (this match length) - (substitution length)
    cumulativeDiff += inputSegment.length() + yylength() - 1;
    // position the correction at (already output length) + (substitution length)
    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
  }
}

<BANG> {
  "--" { yybegin(COMMENT); }
  ">" {
    // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
    cumulativeDiff += inputSegment.length() + yylength();
    // position the correction at (already output length) [ + (substitution length) = 0 ]
    addOffCorrectMap(outputCharCount, cumulativeDiff);
    inputSegment.clear();
    yybegin(YYINITIAL);
  }
  // From XML 1.0 <http://www.w3.org/TR/xml/>:
  //
  // [18] CDSect  ::= CDStart CData CDEnd
  // [19] CDStart ::= '<![CDATA['
  // [20] CData   ::= (Char* - (Char* ']]>' Char*))
||||
// [21] CDEnd ::= ']]>' |
||||
// |
||||
"[CDATA[" { |
||||
// add (previously matched input length) + (this match length) [ - (substitution length) = 0 ] |
||||
cumulativeDiff += inputSegment.length() + yylength(); |
||||
// position the correction at (already output length) [ + (substitution length) = 0 ] |
||||
addOffCorrectMap(outputCharCount, cumulativeDiff); |
||||
inputSegment.clear(); |
||||
yybegin(CDATA); |
||||
} |
||||
[^] { |
||||
inputSegment.append(zzBuffer[zzStartRead]); |
||||
} |
||||
} |
||||
|
||||
<CDATA> { |
||||
"]]>" { |
||||
// add (this match length) [ - (substitution length) = 0 ] |
||||
cumulativeDiff += yylength(); |
||||
// position the correction at (already output length) [ + (substitution length) = 0 ] |
||||
addOffCorrectMap(outputCharCount, cumulativeDiff); |
||||
yybegin(YYINITIAL); |
||||
} |
||||
[^] { return zzBuffer[zzStartRead]; } |
||||
} |
||||
|
||||
<COMMENT> { |
||||
"<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); } |
||||
"-->" { |
||||
// add (previously matched input length) + (this match length) [ - (substitution length) = 0] |
||||
cumulativeDiff += yychar - inputStart + yylength(); |
||||
// position the correction at (already output length) [ + (substitution length) = 0] |
||||
addOffCorrectMap(outputCharCount, cumulativeDiff); |
||||
inputSegment.clear(); |
||||
yybegin(YYINITIAL); |
||||
} |
||||
[^] { } |
||||
} |
||||
|
||||
<SERVER_SIDE_INCLUDE> { |
||||
"-->" { yybegin(restoreState); } |
||||
"'" { |
||||
previousRestoreState = restoreState; |
||||
restoreState = SERVER_SIDE_INCLUDE; |
||||
yybegin(SINGLE_QUOTED_STRING); |
||||
} |
||||
"\"" { |
||||
previousRestoreState = restoreState; |
||||
restoreState = SERVER_SIDE_INCLUDE; |
||||
yybegin(DOUBLE_QUOTED_STRING); |
||||
} |
||||
[^] { } |
||||
} |
||||
|
||||
<SCRIPT_COMMENT> { |
||||
"<!--#" { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE); } |
||||
"'" { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING); } |
||||
"\"" { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING); } |
||||
"-->" { yybegin(SCRIPT); } |
||||
[^] { } |
||||
} |
||||
|
||||
<STYLE_COMMENT> { |
||||
"<!--#" { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE); } |
||||
"'" { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING); } |
||||
"\"" { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING); } |
||||
"-->" { yybegin(STYLE); } |
||||
[^] { } |
||||
} |
||||
|
||||
<SINGLE_QUOTED_STRING> { |
||||
"\\" [^] { } |
||||
"'" { yybegin(restoreState); restoreState = previousRestoreState; } |
||||
[^] { } |
||||
} |
||||
|
||||
<DOUBLE_QUOTED_STRING> { |
||||
"\\" [^] { } |
||||
"\"" { yybegin(restoreState); restoreState = previousRestoreState; } |
||||
[^] { } |
||||
} |
||||
|
||||
<SCRIPT> { |
||||
"<!--" { yybegin(SCRIPT_COMMENT); } |
||||
"</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" { |
||||
inputSegment.clear(); |
||||
yybegin(YYINITIAL); |
||||
// add (previously matched input length) -- current match and substitution handled below |
||||
cumulativeDiff += yychar - inputStart; |
||||
// position at (already output length) -- substitution handled below |
||||
int offsetCorrectionPos = outputCharCount; |
||||
int returnValue; |
||||
if (escapeSCRIPT) { |
||||
inputSegment.write(zzBuffer, zzStartRead, yylength()); |
||||
outputSegment = inputSegment; |
||||
returnValue = outputSegment.nextChar(); |
||||
} else { |
||||
// add (this match length) - (substitution length) |
||||
cumulativeDiff += yylength() - 1; |
||||
// add (substitution length) |
||||
++offsetCorrectionPos; |
||||
returnValue = SCRIPT_REPLACEMENT; |
||||
} |
||||
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff); |
||||
return returnValue; |
||||
} |
||||
[^] { } |
||||
} |
||||
|
||||
<STYLE> { |
||||
"<!--" { yybegin(STYLE_COMMENT); } |
||||
"</" \s* [sS][tT][yY][lL][eE] \s* ">" { |
||||
inputSegment.clear(); |
||||
yybegin(YYINITIAL); |
||||
// add (previously matched input length) -- current match and substitution handled below |
||||
cumulativeDiff += yychar - inputStart; |
||||
// position the offset correction at (already output length) -- substitution handled below |
||||
int offsetCorrectionPos = outputCharCount; |
||||
int returnValue; |
||||
if (escapeSTYLE) { |
||||
inputSegment.write(zzBuffer, zzStartRead, yylength()); |
||||
outputSegment = inputSegment; |
||||
returnValue = outputSegment.nextChar(); |
||||
} else { |
||||
// add (this match length) - (substitution length) |
||||
cumulativeDiff += yylength() - 1; |
||||
// add (substitution length) |
||||
++offsetCorrectionPos; |
||||
returnValue = STYLE_REPLACEMENT; |
||||
} |
||||
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff); |
||||
return returnValue; |
||||
} |
||||
[^] { } |
||||
} |
||||
|
||||
<AMPERSAND,NUMERIC_CHARACTER,CHARACTER_REFERENCE_TAIL,LEFT_ANGLE_BRACKET_SLASH,END_TAG_TAIL_INCLUDE,END_TAG_TAIL_EXCLUDE,END_TAG_TAIL_SUBSTITUTE,LEFT_ANGLE_BRACKET,LEFT_ANGLE_BRACKET_SPACE,START_TAG_TAIL_INCLUDE,START_TAG_TAIL_EXCLUDE,START_TAG_TAIL_SUBSTITUTE,BANG> { |
||||
[^] { |
||||
yypushback(1); |
||||
outputSegment = inputSegment; |
||||
outputSegment.restart(); |
||||
yybegin(YYINITIAL); |
||||
return outputSegment.nextChar(); |
||||
} |
||||
} |
||||
|
||||
[^] { return zzBuffer[zzStartRead]; } |
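The rules above all follow one bookkeeping pattern: whenever stripped or replaced markup makes the output shorter than the input, the rule adds the lost length to cumulativeDiff and records it at the current output position via addOffCorrectMap(outputPos, cumulativeDiff). A minimal sketch of that mechanism (an illustration of the idea only, not the generated filter's actual code) is:

    import java.util.Map;
    import java.util.TreeMap;

    class OffsetCorrectionSketch {
      // output position -> cumulative (input length - output length) so far
      private final TreeMap<Integer,Integer> diffs = new TreeMap<Integer,Integer>();

      void addOffCorrectMap(int outputPos, int cumulativeDiff) {
        diffs.put(outputPos, cumulativeDiff);
      }

      // Maps an offset in the stripped output back to the original input:
      // find the last correction at or before the output offset and add it.
      int correct(int outputOffset) {
        Map.Entry<Integer,Integer> e = diffs.floorEntry(outputOffset);
        return outputOffset + (e == null ? 0 : e.getValue());
      }
    }

For example, stripping "<br>" from "foo<br>bar" with escapeBR off turns 4 input chars into 1 replacement char, so the rule records a cumulative diff of 3 at output position 4, and correct(4) maps the 'b' of "bar" back to input offset 7.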
@ -0,0 +1,70 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.charfilter; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharFilterFactory; |
||||
|
||||
import java.io.Reader; |
||||
import java.util.HashSet; |
||||
import java.util.Map; |
||||
import java.util.Set; |
||||
import java.util.regex.Matcher; |
||||
import java.util.regex.Pattern; |
||||
|
||||
/** |
||||
* Factory for {@link HTMLStripCharFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_html" class="solr.TextField" positionIncrementGap="100"> |
||||
* <analyzer> |
||||
* <charFilter class="solr.HTMLStripCharFilterFactory" escapedTags="a, title" /> |
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
* |
||||
*/ |
||||
public class HTMLStripCharFilterFactory extends CharFilterFactory { |
||||
|
||||
Set<String> escapedTags = null; |
||||
Pattern TAG_NAME_PATTERN = Pattern.compile("[^\\s,]+"); |
||||
|
||||
public HTMLStripCharFilter create(Reader input) { |
||||
HTMLStripCharFilter charFilter; |
||||
if (null == escapedTags) { |
||||
charFilter = new HTMLStripCharFilter(input); |
||||
} else { |
||||
charFilter = new HTMLStripCharFilter(input, escapedTags); |
||||
} |
||||
return charFilter; |
||||
} |
||||
|
||||
@Override |
||||
public void init(Map<String,String> args) { |
||||
super.init(args); |
||||
String escapedTagsArg = args.get("escapedTags"); |
||||
if (null != escapedTagsArg) { |
||||
Matcher matcher = TAG_NAME_PATTERN.matcher(escapedTagsArg); |
||||
while (matcher.find()) { |
||||
if (null == escapedTags) { |
||||
escapedTags = new HashSet<String>(); |
||||
} |
||||
escapedTags.add(matcher.group(0)); |
||||
} |
||||
} |
||||
} |
||||
} |
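A usage sketch for the factory above (the driver class and sample input are hypothetical; only init(Map) and create(Reader) come from this file):

    import java.io.Reader;
    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;

    public class HTMLStripCharFilterFactoryUsage {
      public static void main(String[] args) throws Exception {
        HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
        Map<String,String> params = new HashMap<String,String>();
        params.put("escapedTags", "a, title");  // same syntax as the javadoc example
        factory.init(params);

        Reader stripped =
            factory.create(new StringReader("<b>bold</b> <a href=\"#\">kept</a>"));
        int c;
        while ((c = stripped.read()) != -1) {
          System.out.print((char) c);  // expected: bold <a href="#">kept</a>
        }
      }
    }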
@ -0,0 +1,191 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package com.fr.third.org.apache.lucene.analysis.charfilter; |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
import java.util.Map; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.CharFilter; // javadocs |
||||
import com.fr.third.org.apache.lucene.analysis.util.RollingCharBuffer; |
||||
import com.fr.third.org.apache.lucene.util.CharsRef; |
||||
import com.fr.third.org.apache.lucene.util.fst.CharSequenceOutputs; |
||||
import com.fr.third.org.apache.lucene.util.fst.FST; |
||||
import com.fr.third.org.apache.lucene.util.fst.Outputs; |
||||
|
||||
/** |
||||
* Simplistic {@link CharFilter} that applies the mappings |
||||
* contained in a {@link NormalizeCharMap} to the character |
||||
* stream, and corrects the resulting changes to the |
||||
* offsets. Matching is greedy (longest pattern matching at |
||||
* a given point wins). Replacement is allowed to be the |
||||
* empty string. |
||||
*/ |
||||
|
||||
public class MappingCharFilter extends BaseCharFilter { |
||||
|
||||
private final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); |
||||
private final FST<CharsRef> map; |
||||
private final FST.BytesReader fstReader; |
||||
private final RollingCharBuffer buffer = new RollingCharBuffer(); |
||||
private final FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>(); |
||||
private final Map<Character,FST.Arc<CharsRef>> cachedRootArcs; |
||||
|
||||
private CharsRef replacement; |
||||
private int replacementPointer; |
||||
private int inputOff; |
||||
|
||||
/** Default constructor that takes a {@link Reader}. */ |
||||
public MappingCharFilter(NormalizeCharMap normMap, Reader in) { |
||||
super(in); |
||||
buffer.reset(in); |
||||
|
||||
map = normMap.map; |
||||
cachedRootArcs = normMap.cachedRootArcs; |
||||
|
||||
if (map != null) { |
||||
fstReader = map.getBytesReader(0); |
||||
} else { |
||||
fstReader = null; |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public void reset() throws IOException { |
||||
input.reset(); |
||||
buffer.reset(input); |
||||
replacement = null; |
||||
inputOff = 0; |
||||
} |
||||
|
||||
@Override |
||||
public int read() throws IOException { |
||||
|
||||
//System.out.println("\nread");
|
||||
while(true) { |
||||
|
||||
if (replacement != null && replacementPointer < replacement.length) { |
||||
//System.out.println(" return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
|
||||
return replacement.chars[replacement.offset + replacementPointer++]; |
||||
} |
||||
|
||||
// TODO: a more efficient approach would be Aho/Corasick's |
||||
// algorithm |
||||
// (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm) |
||||
// or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps |
||||
// |
||||
// I think this would be (almost?) equivalent to 1) adding |
||||
// epsilon arcs from all final nodes back to the init |
||||
// node in the FST, 2) adding a .* (skip any char) |
||||
// loop on the initial node, and 3) determinizing |
||||
// that.  Then we would not have to restart matching |
||||
// at each position. |
||||
|
||||
int lastMatchLen = -1; |
||||
CharsRef lastMatch = null; |
||||
|
||||
final int firstCH = buffer.get(inputOff); |
||||
if (firstCH != -1) { |
||||
FST.Arc<CharsRef> arc = cachedRootArcs.get(Character.valueOf((char) firstCH)); |
||||
if (arc != null) { |
||||
if (!FST.targetHasArcs(arc)) { |
||||
// Fast pass for single character match: |
||||
assert arc.isFinal(); |
||||
lastMatchLen = 1; |
||||
lastMatch = arc.output; |
||||
} else { |
||||
int lookahead = 0; |
||||
CharsRef output = arc.output; |
||||
while (true) { |
||||
lookahead++; |
||||
|
||||
if (arc.isFinal()) { |
||||
// Match! (to node is final) |
||||
lastMatchLen = lookahead; |
||||
lastMatch = outputs.add(output, arc.nextFinalOutput); |
||||
// Greedy: keep searching to see if there's a |
||||
// longer match... |
||||
} |
||||
|
||||
if (!FST.targetHasArcs(arc)) { |
||||
break; |
||||
} |
||||
|
||||
int ch = buffer.get(inputOff + lookahead); |
||||
if (ch == -1) { |
||||
break; |
||||
} |
||||
if ((arc = map.findTargetArc(ch, arc, scratchArc, fstReader)) == null) { |
||||
// Dead end |
||||
break; |
||||
} |
||||
output = outputs.add(output, arc.output); |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
if (lastMatch != null) { |
||||
inputOff += lastMatchLen; |
||||
//System.out.println(" match! len=" + lastMatchLen + " repl=" + lastMatch);
|
||||
|
||||
final int diff = lastMatchLen - lastMatch.length; |
||||
|
||||
if (diff != 0) { |
||||
final int prevCumulativeDiff = getLastCumulativeDiff(); |
||||
if (diff > 0) { |
||||
// Replacement is shorter than matched input: |
||||
addOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff); |
||||
} else { |
||||
// Replacement is longer than matched input: remap |
||||
// the "extra" chars all back to the same input |
||||
// offset: |
||||
final int outputStart = inputOff - prevCumulativeDiff; |
||||
for(int extraIDX=0;extraIDX<-diff;extraIDX++) { |
||||
addOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1); |
||||
} |
||||
} |
||||
} |
||||
|
||||
replacement = lastMatch; |
||||
replacementPointer = 0; |
||||
|
||||
} else { |
||||
final int ret = buffer.get(inputOff); |
||||
if (ret != -1) { |
||||
inputOff++; |
||||
buffer.freeBefore(inputOff); |
||||
} |
||||
return ret; |
||||
} |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public int read(char[] cbuf, int off, int len) throws IOException { |
||||
int numRead = 0; |
||||
for(int i = off; i < off + len; i++) { |
||||
int c = read(); |
||||
if (c == -1) break; |
||||
cbuf[i] = (char) c; |
||||
numRead++; |
||||
} |
||||
|
||||
return numRead == 0 ? -1 : numRead; |
||||
} |
||||
} |
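A minimal usage sketch (hypothetical driver class; the Builder is the NormalizeCharMap.Builder defined later in this change):

    import java.io.Reader;
    import java.io.StringReader;

    public class MappingCharFilterUsage {
      public static void main(String[] args) throws Exception {
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        builder.add("ph", "f");   // matching is greedy, so "ph" wins over any "p" rule
        builder.add("qu", "kw");
        NormalizeCharMap normMap = builder.build();

        Reader mapped = new MappingCharFilter(normMap, new StringReader("quick photo"));
        int c;
        while ((c = mapped.read()) != -1) {
          System.out.print((char) c);  // expected: kwick foto
        }
      }
    }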
@ -0,0 +1,135 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.charfilter; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.File; |
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
import java.util.ArrayList; |
||||
import java.util.List; |
||||
import java.util.regex.Matcher; |
||||
import java.util.regex.Pattern; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.charfilter.MappingCharFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.charfilter.NormalizeCharMap; |
||||
import com.fr.third.org.apache.lucene.analysis.util.*; |
||||
|
||||
/** |
||||
* Factory for {@link MappingCharFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_map" class="solr.TextField" positionIncrementGap="100"> |
||||
* <analyzer> |
||||
* <charFilter class="solr.MappingCharFilterFactory" mapping="mapping.txt"/> |
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
* |
||||
* |
||||
* @since Solr 1.4 |
||||
* |
||||
*/ |
||||
public class MappingCharFilterFactory extends CharFilterFactory implements |
||||
ResourceLoaderAware, MultiTermAwareComponent { |
||||
|
||||
protected NormalizeCharMap normMap; |
||||
private String mapping; |
||||
|
||||
// TODO: this should use inputstreams from the loader, not File! |
||||
public void inform(ResourceLoader loader) throws IOException { |
||||
mapping = args.get("mapping"); |
||||
|
||||
if (mapping != null) { |
||||
List<String> wlist = null; |
||||
File mappingFile = new File(mapping); |
||||
if (mappingFile.exists()) { |
||||
wlist = getLines(loader, mapping); |
||||
} else { |
||||
List<String> files = splitFileNames(mapping); |
||||
wlist = new ArrayList<String>(); |
||||
for (String file : files) { |
||||
List<String> lines = getLines(loader, file.trim()); |
||||
wlist.addAll(lines); |
||||
} |
||||
} |
||||
final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); |
||||
parseRules(wlist, builder); |
||||
normMap = builder.build(); |
||||
if (normMap.map == null) { |
||||
// if the inner FST is null, it means it accepts nothing (e.g. the file is empty) |
||||
// so just set the whole map to null |
||||
normMap = null; |
||||
} |
||||
} |
||||
} |
||||
|
||||
public Reader create(Reader input) { |
||||
// if the map is null, it means there's actually no mappings... just return the original stream |
||||
// as there is nothing to do here. |
||||
return normMap == null ? input : new MappingCharFilter(normMap,input); |
||||
} |
||||
|
||||
// "source" => "target"
|
||||
static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" ); |
||||
|
||||
protected void parseRules( List<String> rules, NormalizeCharMap.Builder builder ){ |
||||
for( String rule : rules ){ |
||||
Matcher m = p.matcher( rule ); |
||||
if( !m.find() ) |
||||
throw new IllegalArgumentException("Invalid Mapping Rule : [" + rule + "], file = " + mapping); |
||||
builder.add( parseString( m.group( 1 ) ), parseString( m.group( 2 ) ) ); |
||||
} |
||||
} |
||||
|
||||
char[] out = new char[256]; |
||||
|
||||
protected String parseString( String s ){ |
||||
int readPos = 0; |
||||
int len = s.length(); |
||||
int writePos = 0; |
||||
while( readPos < len ){ |
||||
char c = s.charAt( readPos++ ); |
||||
if( c == '\\' ){ |
||||
if( readPos >= len ) |
||||
throw new IllegalArgumentException("Invalid escaped char in [" + s + "]"); |
||||
c = s.charAt( readPos++ ); |
||||
switch( c ) { |
||||
case '\\' : c = '\\'; break; |
||||
case '"' : c = '"'; break; |
||||
case 'n' : c = '\n'; break; |
||||
case 't' : c = '\t'; break; |
||||
case 'r' : c = '\r'; break; |
||||
case 'b' : c = '\b'; break; |
||||
case 'f' : c = '\f'; break; |
||||
case 'u' : |
||||
if( readPos + 3 >= len ) |
||||
throw new IllegalArgumentException("Invalid escaped char in [" + s + "]"); |
||||
c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 ); |
||||
readPos += 4; |
||||
break; |
||||
} |
||||
} |
||||
out[writePos++] = c; |
||||
} |
||||
return new String( out, 0, writePos ); |
||||
} |
||||
|
||||
@Override |
||||
public AbstractAnalysisFactory getMultiTermComponent() { |
||||
return this; |
||||
} |
||||
} |
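For reference, the rule syntax that parseRules()/parseString() above accept is one quoted pair per line; a hypothetical mapping.txt could contain:

    "&amp;" => "&"
    "\u00E9" => "e"
    "\t" => " "

Each left-hand string is fed to NormalizeCharMap.Builder.add(), so the usual Builder restrictions apply (no empty and no duplicate match strings), and the backslash escapes are exactly the ones handled by the switch in parseString().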
@ -0,0 +1,127 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package com.fr.third.org.apache.lucene.analysis.charfilter; |
||||
|
||||
import java.io.IOException; |
||||
import java.util.HashMap; |
||||
import java.util.Map; |
||||
import java.util.TreeMap; |
||||
|
||||
import com.fr.third.org.apache.lucene.util.CharsRef; |
||||
import com.fr.third.org.apache.lucene.util.IntsRef; |
||||
import com.fr.third.org.apache.lucene.util.fst.Builder; |
||||
import com.fr.third.org.apache.lucene.util.fst.CharSequenceOutputs; |
||||
import com.fr.third.org.apache.lucene.util.fst.FST; |
||||
import com.fr.third.org.apache.lucene.util.fst.Outputs; |
||||
import com.fr.third.org.apache.lucene.util.fst.Util; |
||||
|
||||
// TODO: save/load? |
||||
|
||||
/** |
||||
* Holds a map of String input to String output, to be used |
||||
* with {@link MappingCharFilter}. Use the {@link Builder} |
||||
* to create this. |
||||
*/ |
||||
public class NormalizeCharMap { |
||||
|
||||
final FST<CharsRef> map; |
||||
final Map<Character,FST.Arc<CharsRef>> cachedRootArcs = new HashMap<Character,FST.Arc<CharsRef>>(); |
||||
|
||||
// Use the builder to create: |
||||
private NormalizeCharMap(FST<CharsRef> map) { |
||||
this.map = map; |
||||
if (map != null) { |
||||
try { |
||||
// Pre-cache root arcs: |
||||
final FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>(); |
||||
final FST.BytesReader fstReader = map.getBytesReader(0); |
||||
map.getFirstArc(scratchArc); |
||||
if (FST.targetHasArcs(scratchArc)) { |
||||
map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader); |
||||
while(true) { |
||||
assert scratchArc.label != FST.END_LABEL; |
||||
cachedRootArcs.put(Character.valueOf((char) scratchArc.label), new FST.Arc<CharsRef>().copyFrom(scratchArc)); |
||||
if (scratchArc.isLast()) { |
||||
break; |
||||
} |
||||
map.readNextRealArc(scratchArc, fstReader); |
||||
} |
||||
} |
||||
//System.out.println("cached " + cachedRootArcs.size() + " root arcs");
|
||||
} catch (IOException ioe) { |
||||
// Bogus FST IOExceptions!! (will never happen) |
||||
throw new RuntimeException(ioe); |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Builds a NormalizeCharMap. |
||||
* <p> |
||||
* Call add() until you have added all the mappings, then call build() to get a NormalizeCharMap. |
||||
* @lucene.experimental |
||||
*/ |
||||
public static class Builder { |
||||
|
||||
private final Map<String,String> pendingPairs = new TreeMap<String,String>(); |
||||
|
||||
/** Records a replacement to be applied to the input |
||||
* stream. Whenever <code>match</code> occurs in |
||||
* the input, it will be replaced with |
||||
* <code>replacement</code>. |
||||
* |
||||
* @param match input String to be replaced |
||||
* @param replacement output String |
||||
* @throws IllegalArgumentException if |
||||
* <code>match</code> is the empty string, or was |
||||
* already previously added |
||||
*/ |
||||
public void add(String match, String replacement) { |
||||
if (match.length() == 0 ){ |
||||
throw new IllegalArgumentException("cannot match the empty string"); |
||||
} |
||||
if (pendingPairs.containsKey(match)) { |
||||
throw new IllegalArgumentException("match \"" + match + "\" was already added"); |
||||
} |
||||
pendingPairs.put(match, replacement); |
||||
} |
||||
|
||||
/** Builds the NormalizeCharMap; call this once you |
||||
* are done calling {@link #add}. */ |
||||
public NormalizeCharMap build() { |
||||
|
||||
final FST<CharsRef> map; |
||||
try { |
||||
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); |
||||
final com.fr.third.org.apache.lucene.util.fst.Builder<CharsRef> builder = new com.fr.third.org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs); |
||||
final IntsRef scratch = new IntsRef(); |
||||
for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { |
||||
builder.add(Util.toUTF16(ent.getKey(), scratch), |
||||
new CharsRef(ent.getValue())); |
||||
} |
||||
map = builder.finish(); |
||||
pendingPairs.clear(); |
||||
} catch (IOException ioe) { |
||||
// Bogus FST IOExceptions!! (will never happen) |
||||
throw new RuntimeException(ioe); |
||||
} |
||||
|
||||
return new NormalizeCharMap(map); |
||||
} |
||||
} |
||||
} |
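A sketch of the Builder contract documented above (hypothetical snippet; the exception behavior matches the checks in add()):

    public class NormalizeCharMapBuilderContract {
      public static void main(String[] args) {
        NormalizeCharMap.Builder b = new NormalizeCharMap.Builder();
        b.add("oe", "\u0153");            // oe -> œ
        try {
          b.add("oe", "\u00F6");          // rejected: duplicate match string
        } catch (IllegalArgumentException expected) {
          System.out.println(expected.getMessage()); // match "oe" was already added
        }
        NormalizeCharMap map = b.build(); // FST built from the one surviving pair
      }
    }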
@ -0,0 +1,539 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more |
||||
# contributor license agreements. See the NOTICE file distributed with |
||||
# this work for additional information regarding copyright ownership. |
||||
# The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
# (the "License"); you may not use this file except in compliance with |
||||
# the License. You may obtain a copy of the License at |
||||
# |
||||
# http://www.apache.org/licenses/LICENSE-2.0 |
||||
# |
||||
# Unless required by applicable law or agreed to in writing, software |
||||
# distributed under the License is distributed on an "AS IS" BASIS, |
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
# See the License for the specific language governing permissions and |
||||
# limitations under the License. |
||||
|
||||
import re |
||||
|
||||
# A simple python script to generate an HTML entity map and a regex alternation |
||||
# for inclusion in HTMLStripCharFilter.jflex. |
||||
|
||||
def main(): |
||||
print get_apache_license() |
||||
codes = {} |
||||
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"') |
||||
for line in get_entity_text().split('\n'): |
||||
match = regex.match(line) |
||||
if match: |
||||
key = match.group(1) |
||||
if key == 'quot': codes[key] = r'\"' |
||||
elif key == 'nbsp': codes[key] = ' ' |
||||
else : codes[key] = r'\u%04X' % int(match.group(2)) |
||||
|
||||
keys = sorted(codes) |
||||
|
||||
first_entry = True |
||||
output_line = 'CharacterEntities = ( ' |
||||
for key in keys: |
||||
new_entry = ('"%s"' if first_entry else ' | "%s"') % key |
||||
first_entry = False |
||||
if len(output_line) + len(new_entry) >= 80: |
||||
print output_line |
||||
output_line = ' ' |
||||
output_line += new_entry |
||||
if key in ('quot','copy','gt','lt','reg','amp'): |
||||
new_entry = ' | "%s"' % key.upper() |
||||
if len(output_line) + len(new_entry) >= 80: |
||||
print output_line |
||||
output_line = ' ' |
||||
output_line += new_entry |
||||
print output_line, ')' |
||||
|
||||
print '%{' |
||||
print ' private static final Map<String,String> upperCaseVariantsAccepted' |
||||
print ' = new HashMap<String,String>();' |
||||
print ' static {' |
||||
print ' upperCaseVariantsAccepted.put("quot", "QUOT");' |
||||
print ' upperCaseVariantsAccepted.put("copy", "COPY");' |
||||
print ' upperCaseVariantsAccepted.put("gt", "GT");' |
||||
print ' upperCaseVariantsAccepted.put("lt", "LT");' |
||||
print ' upperCaseVariantsAccepted.put("reg", "REG");' |
||||
print ' upperCaseVariantsAccepted.put("amp", "AMP");' |
||||
print ' }' |
||||
print ' private static final CharArrayMap<Character> entityValues' |
||||
print ' = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys) |
||||
print ' static {' |
||||
print ' String[] entities = {' |
||||
output_line = ' ' |
||||
for key in keys: |
||||
new_entry = ' "%s", "%s",' % (key, codes[key]) |
||||
if len(output_line) + len(new_entry) >= 80: |
||||
print output_line |
||||
output_line = ' ' |
||||
output_line += new_entry |
||||
print output_line[:-1] |
||||
print ' };' |
||||
print ' for (int i = 0 ; i < entities.length ; i += 2) {' |
||||
print ' Character value = entities[i + 1].charAt(0);' |
||||
print ' entityValues.put(entities[i], value);' |
||||
print ' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);' |
||||
print ' if (upperCaseVariant != null) {' |
||||
print ' entityValues.put(upperCaseVariant, value);' |
||||
print ' }' |
||||
print ' }' |
||||
print " }" |
||||
print "%}" |
||||
|
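# For orientation, the output printed by main() above is a JFlex macro plus a
# Java static block, shaped roughly like this (abbreviated and illustrative,
# not verbatim output):
#
#   CharacterEntities = ( "AElig" | "AMP" | "Aacute" | "Acirc" | "Agrave"
#                       ...
#                       | "zwj" | "zwnj" )
#   %{
#     private static final Map<String,String> upperCaseVariantsAccepted
#         = new HashMap<String,String>();
#     ...
#   %}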
||||
def get_entity_text(): |
||||
# The text below is taken verbatim from |
||||
# <http://www.w3.org/TR/REC-html40/sgml/entities.html>: |
||||
text = r""" |
||||
F.1. XHTML Character Entities |
||||
|
||||
XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section. |
||||
F.1.1. XHTML Latin 1 Character Entities |
||||
|
||||
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent. |
||||
|
||||
<!-- ...................................................................... --> |
||||
<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ --> |
||||
<!-- file: xhtml-lat1.ent |
||||
|
||||
Typical invocation: |
||||
|
||||
<!ENTITY % xhtml-lat1 |
||||
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN" |
||||
"xhtml-lat1.ent" > |
||||
%xhtml-lat1; |
||||
|
||||
This DTD module is identified by the PUBLIC and SYSTEM identifiers: |
||||
|
||||
PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN" |
||||
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent" |
||||
|
||||
Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI |
||||
|
||||
Portions (C) International Organization for Standardization 1986: |
||||
Permission to copy in any form is granted for use with conforming |
||||
SGML systems and applications as defined in ISO 8879, provided |
||||
this notice is included in all copies. |
||||
--> |
||||
|
||||
<!ENTITY nbsp " " ><!-- no-break space = non-breaking space, U+00A0 ISOnum --> |
||||
<!ENTITY iexcl "¡" ><!-- inverted exclamation mark, U+00A1 ISOnum --> |
||||
<!ENTITY cent "¢" ><!-- cent sign, U+00A2 ISOnum --> |
||||
<!ENTITY pound "£" ><!-- pound sign, U+00A3 ISOnum --> |
||||
<!ENTITY curren "¤" ><!-- currency sign, U+00A4 ISOnum --> |
||||
<!ENTITY yen "¥" ><!-- yen sign = yuan sign, U+00A5 ISOnum --> |
||||
<!ENTITY brvbar "¦" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum --> |
||||
<!ENTITY sect "§" ><!-- section sign, U+00A7 ISOnum --> |
||||
<!ENTITY uml "¨" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia --> |
||||
<!ENTITY copy "©" ><!-- copyright sign, U+00A9 ISOnum --> |
||||
<!ENTITY ordf "ª" ><!-- feminine ordinal indicator, U+00AA ISOnum --> |
||||
<!ENTITY laquo "«" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum --> |
||||
<!ENTITY not "¬" ><!-- not sign, U+00AC ISOnum --> |
||||
<!ENTITY shy "­" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum --> |
||||
<!ENTITY reg "®" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum --> |
||||
<!ENTITY macr "¯" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia --> |
||||
<!ENTITY deg "°" ><!-- degree sign, U+00B0 ISOnum --> |
||||
<!ENTITY plusmn "±" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum --> |
||||
<!ENTITY sup2 "²" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum --> |
||||
<!ENTITY sup3 "³" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum --> |
||||
<!ENTITY acute "´" ><!-- acute accent = spacing acute, U+00B4 ISOdia --> |
||||
<!ENTITY micro "µ" ><!-- micro sign, U+00B5 ISOnum --> |
||||
<!ENTITY para "¶" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum --> |
||||
<!ENTITY middot "·" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum --> |
||||
<!ENTITY cedil "¸" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia --> |
||||
<!ENTITY sup1 "¹" ><!-- superscript one = superscript digit one, U+00B9 ISOnum --> |
||||
<!ENTITY ordm "º" ><!-- masculine ordinal indicator, U+00BA ISOnum --> |
||||
<!ENTITY raquo "»" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum --> |
||||
<!ENTITY frac14 "¼" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum --> |
||||
<!ENTITY frac12 "½" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum --> |
||||
<!ENTITY frac34 "¾" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum --> |
||||
<!ENTITY iquest "¿" ><!-- inverted question mark = turned question mark, U+00BF ISOnum --> |
||||
<!ENTITY Agrave "À" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 --> |
||||
<!ENTITY Aacute "Á" ><!-- latin capital A with acute, U+00C1 ISOlat1 --> |
||||
<!ENTITY Acirc "Â" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 --> |
||||
<!ENTITY Atilde "Ã" ><!-- latin capital A with tilde, U+00C3 ISOlat1 --> |
||||
<!ENTITY Auml "Ä" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 --> |
||||
<!ENTITY Aring "Å" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 --> |
||||
<!ENTITY AElig "Æ" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 --> |
||||
<!ENTITY Ccedil "Ç" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 --> |
||||
<!ENTITY Egrave "È" ><!-- latin capital E with grave, U+00C8 ISOlat1 --> |
||||
<!ENTITY Eacute "É" ><!-- latin capital E with acute, U+00C9 ISOlat1 --> |
||||
<!ENTITY Ecirc "Ê" ><!-- latin capital E with circumflex, U+00CA ISOlat1 --> |
||||
<!ENTITY Euml "Ë" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 --> |
||||
<!ENTITY Igrave "Ì" ><!-- latin capital I with grave, U+00CC ISOlat1 --> |
||||
<!ENTITY Iacute "Í" ><!-- latin capital I with acute, U+00CD ISOlat1 --> |
||||
<!ENTITY Icirc "Î" ><!-- latin capital I with circumflex, U+00CE ISOlat1 --> |
||||
<!ENTITY Iuml "Ï" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 --> |
||||
<!ENTITY ETH "Ð" ><!-- latin capital ETH, U+00D0 ISOlat1 --> |
||||
<!ENTITY Ntilde "Ñ" ><!-- latin capital N with tilde, U+00D1 ISOlat1 --> |
||||
<!ENTITY Ograve "Ò" ><!-- latin capital O with grave, U+00D2 ISOlat1 --> |
||||
<!ENTITY Oacute "Ó" ><!-- latin capital O with acute, U+00D3 ISOlat1 --> |
||||
<!ENTITY Ocirc "Ô" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 --> |
||||
<!ENTITY Otilde "Õ" ><!-- latin capital O with tilde, U+00D5 ISOlat1 --> |
||||
<!ENTITY Ouml "Ö" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 --> |
||||
<!ENTITY times "×" ><!-- multiplication sign, U+00D7 ISOnum --> |
||||
<!ENTITY Oslash "Ø" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 --> |
||||
<!ENTITY Ugrave "Ù" ><!-- latin capital U with grave, U+00D9 ISOlat1 --> |
||||
<!ENTITY Uacute "Ú" ><!-- latin capital U with acute, U+00DA ISOlat1 --> |
||||
<!ENTITY Ucirc "Û" ><!-- latin capital U with circumflex, U+00DB ISOlat1 --> |
||||
<!ENTITY Uuml "Ü" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 --> |
||||
<!ENTITY Yacute "Ý" ><!-- latin capital Y with acute, U+00DD ISOlat1 --> |
||||
<!ENTITY THORN "Þ" ><!-- latin capital THORN, U+00DE ISOlat1 --> |
||||
<!ENTITY szlig "ß" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 --> |
||||
<!ENTITY agrave "à" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 --> |
||||
<!ENTITY aacute "á" ><!-- latin small a with acute, U+00E1 ISOlat1 --> |
||||
<!ENTITY acirc "â" ><!-- latin small a with circumflex, U+00E2 ISOlat1 --> |
||||
<!ENTITY atilde "ã" ><!-- latin small a with tilde, U+00E3 ISOlat1 --> |
||||
<!ENTITY auml "ä" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 --> |
||||
<!ENTITY aring "å" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 --> |
||||
<!ENTITY aelig "æ" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 --> |
||||
<!ENTITY ccedil "ç" ><!-- latin small c with cedilla, U+00E7 ISOlat1 --> |
||||
<!ENTITY egrave "è" ><!-- latin small e with grave, U+00E8 ISOlat1 --> |
||||
<!ENTITY eacute "é" ><!-- latin small e with acute, U+00E9 ISOlat1 --> |
||||
<!ENTITY ecirc "ê" ><!-- latin small e with circumflex, U+00EA ISOlat1 --> |
||||
<!ENTITY euml "ë" ><!-- latin small e with diaeresis, U+00EB ISOlat1 --> |
||||
<!ENTITY igrave "ì" ><!-- latin small i with grave, U+00EC ISOlat1 --> |
||||
<!ENTITY iacute "í" ><!-- latin small i with acute, U+00ED ISOlat1 --> |
||||
<!ENTITY icirc "î" ><!-- latin small i with circumflex, U+00EE ISOlat1 --> |
||||
<!ENTITY iuml "ï" ><!-- latin small i with diaeresis, U+00EF ISOlat1 --> |
||||
<!ENTITY eth "ð" ><!-- latin small eth, U+00F0 ISOlat1 --> |
||||
<!ENTITY ntilde "ñ" ><!-- latin small n with tilde, U+00F1 ISOlat1 --> |
||||
<!ENTITY ograve "ò" ><!-- latin small o with grave, U+00F2 ISOlat1 --> |
||||
<!ENTITY oacute "ó" ><!-- latin small o with acute, U+00F3 ISOlat1 --> |
||||
<!ENTITY ocirc "ô" ><!-- latin small o with circumflex, U+00F4 ISOlat1 --> |
||||
<!ENTITY otilde "õ" ><!-- latin small o with tilde, U+00F5 ISOlat1 --> |
||||
<!ENTITY ouml "ö" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 --> |
||||
<!ENTITY divide "÷" ><!-- division sign, U+00F7 ISOnum --> |
||||
<!ENTITY oslash "ø" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 --> |
||||
<!ENTITY ugrave "ù" ><!-- latin small u with grave, U+00F9 ISOlat1 --> |
||||
<!ENTITY uacute "ú" ><!-- latin small u with acute, U+00FA ISOlat1 --> |
||||
<!ENTITY ucirc "û" ><!-- latin small u with circumflex, U+00FB ISOlat1 --> |
||||
<!ENTITY uuml "ü" ><!-- latin small u with diaeresis, U+00FC ISOlat1 --> |
||||
<!ENTITY yacute "ý" ><!-- latin small y with acute, U+00FD ISOlat1 --> |
||||
<!ENTITY thorn "þ" ><!-- latin small thorn with, U+00FE ISOlat1 --> |
||||
<!ENTITY yuml "ÿ" ><!-- latin small y with diaeresis, U+00FF ISOlat1 --> |
||||
<!-- end of xhtml-lat1.ent --> |
||||
|
||||
F.1.2. XHTML Special Characters |
||||
|
||||
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent. |
||||
|
||||
<!-- ...................................................................... --> |
||||
<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ --> |
||||
<!-- file: xhtml-special.ent |
||||
|
||||
Typical invocation: |
||||
|
||||
<!ENTITY % xhtml-special |
||||
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN" |
||||
"xhtml-special.ent" > |
||||
%xhtml-special; |
||||
|
||||
This DTD module is identified by the PUBLIC and SYSTEM identifiers: |
||||
|
||||
PUBLIC "-//W3C//ENTITIES Special for XHTML//EN" |
||||
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent" |
||||
|
||||
Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI |
||||
|
||||
Portions (C) International Organization for Standardization 1986: |
||||
Permission to copy in any form is granted for use with conforming |
||||
SGML systems and applications as defined in ISO 8879, provided |
||||
this notice is included in all copies. |
||||
|
||||
Revisions: |
||||
2000-10-28: added ' and altered XML Predefined Entities for compatibility |
||||
--> |
||||
|
||||
<!-- Relevant ISO entity set is given unless names are newly introduced. |
||||
New names (i.e., not in ISO 8879 [SGML] list) do not clash with |
||||
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character |
||||
numbers are given for each character, in hex. Entity values are |
||||
decimal conversions of the ISO 10646 values and refer to the |
||||
document character set. Names are Unicode [UNICODE] names. |
||||
--> |
||||
|
||||
<!-- C0 Controls and Basic Latin --> |
||||
<!ENTITY lt "&#60;" ><!-- less-than sign, U+003C ISOnum --> |
||||
<!ENTITY gt ">" ><!-- greater-than sign, U+003E ISOnum --> |
||||
<!ENTITY amp "&#38;" ><!-- ampersand, U+0026 ISOnum --> |
||||
<!ENTITY apos "'" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum --> |
||||
<!ENTITY quot """ ><!-- quotation mark (Quote Double), U+0022 ISOnum --> |
||||
|
||||
<!-- Latin Extended-A --> |
||||
<!ENTITY OElig "Œ" ><!-- latin capital ligature OE, U+0152 ISOlat2 --> |
||||
<!ENTITY oelig "œ" ><!-- latin small ligature oe, U+0153 ISOlat2 --> |
||||
|
||||
<!-- ligature is a misnomer, this is a separate character in some languages --> |
||||
<!ENTITY Scaron "Š" ><!-- latin capital letter S with caron, U+0160 ISOlat2 --> |
||||
<!ENTITY scaron "š" ><!-- latin small letter s with caron, U+0161 ISOlat2 --> |
||||
<!ENTITY Yuml "Ÿ" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 --> |
||||
|
||||
<!-- Spacing Modifier Letters --> |
||||
<!ENTITY circ "ˆ" ><!-- modifier letter circumflex accent, U+02C6 ISOpub --> |
||||
<!ENTITY tilde "˜" ><!-- small tilde, U+02DC ISOdia --> |
||||
|
||||
<!-- General Punctuation --> |
||||
<!ENTITY ensp " " ><!-- en space, U+2002 ISOpub --> |
||||
<!ENTITY emsp " " ><!-- em space, U+2003 ISOpub --> |
||||
<!ENTITY thinsp " " ><!-- thin space, U+2009 ISOpub --> |
||||
<!ENTITY zwnj "‌" ><!-- zero width non-joiner, U+200C NEW RFC 2070 --> |
||||
<!ENTITY zwj "‍" ><!-- zero width joiner, U+200D NEW RFC 2070 --> |
||||
<!ENTITY lrm "‎" ><!-- left-to-right mark, U+200E NEW RFC 2070 --> |
||||
<!ENTITY rlm "‏" ><!-- right-to-left mark, U+200F NEW RFC 2070 --> |
||||
<!ENTITY ndash "–" ><!-- en dash, U+2013 ISOpub --> |
||||
<!ENTITY mdash "—" ><!-- em dash, U+2014 ISOpub --> |
||||
<!ENTITY lsquo "‘" ><!-- left single quotation mark, U+2018 ISOnum --> |
||||
<!ENTITY rsquo "’" ><!-- right single quotation mark, U+2019 ISOnum --> |
||||
<!ENTITY sbquo "‚" ><!-- single low-9 quotation mark, U+201A NEW --> |
||||
<!ENTITY ldquo "“" ><!-- left double quotation mark, U+201C ISOnum --> |
||||
<!ENTITY rdquo "”" ><!-- right double quotation mark, U+201D ISOnum --> |
||||
<!ENTITY bdquo "„" ><!-- double low-9 quotation mark, U+201E NEW --> |
||||
<!ENTITY dagger "†" ><!-- dagger, U+2020 ISOpub --> |
||||
<!ENTITY Dagger "‡" ><!-- double dagger, U+2021 ISOpub --> |
||||
<!ENTITY permil "‰" ><!-- per mille sign, U+2030 ISOtech --> |
||||
|
||||
<!-- lsaquo is proposed but not yet ISO standardized --> |
||||
<!ENTITY lsaquo "‹" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed --> |
||||
<!-- rsaquo is proposed but not yet ISO standardized --> |
||||
<!ENTITY rsaquo "›" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed --> |
||||
<!ENTITY euro "€" ><!-- euro sign, U+20AC NEW --> |
||||
|
||||
<!-- end of xhtml-special.ent --> |
||||
|
||||
F.1.3. XHTML Mathematical, Greek, and Symbolic Characters |
||||
|
||||
You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent. |
||||
|
||||
<!-- ...................................................................... --> |
||||
<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... --> |
||||
<!-- file: xhtml-symbol.ent |
||||
|
||||
Typical invocation: |
||||
|
||||
<!ENTITY % xhtml-symbol |
||||
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN" |
||||
"xhtml-symbol.ent" > |
||||
%xhtml-symbol; |
||||
|
||||
This DTD module is identified by the PUBLIC and SYSTEM identifiers: |
||||
|
||||
PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN" |
||||
SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent" |
||||
|
||||
Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI |
||||
|
||||
Portions (C) International Organization for Standardization 1986: |
||||
Permission to copy in any form is granted for use with conforming |
||||
SGML systems and applications as defined in ISO 8879, provided |
||||
this notice is included in all copies. |
||||
--> |
||||
|
||||
<!-- Relevant ISO entity set is given unless names are newly introduced. |
||||
New names (i.e., not in ISO 8879 [SGML] list) do not clash with |
||||
any existing ISO 8879 entity names. ISO 10646 [ISO10646] character |
||||
numbers are given for each character, in hex. Entity values are |
||||
decimal conversions of the ISO 10646 values and refer to the |
||||
document character set. Names are Unicode [UNICODE] names. |
||||
--> |
||||
|
||||
<!-- Latin Extended-B --> |
||||
<!ENTITY fnof "ƒ" ><!-- latin small f with hook = function |
||||
= florin, U+0192 ISOtech --> |
||||
|
||||
<!-- Greek --> |
||||
<!ENTITY Alpha "Α" ><!-- greek capital letter alpha, U+0391 --> |
||||
<!ENTITY Beta "Β" ><!-- greek capital letter beta, U+0392 --> |
||||
<!ENTITY Gamma "Γ" ><!-- greek capital letter gamma, U+0393 ISOgrk3 --> |
||||
<!ENTITY Delta "Δ" ><!-- greek capital letter delta, U+0394 ISOgrk3 --> |
||||
<!ENTITY Epsilon "Ε" ><!-- greek capital letter epsilon, U+0395 --> |
||||
<!ENTITY Zeta "Ζ" ><!-- greek capital letter zeta, U+0396 --> |
||||
<!ENTITY Eta "Η" ><!-- greek capital letter eta, U+0397 --> |
||||
<!ENTITY Theta "Θ" ><!-- greek capital letter theta, U+0398 ISOgrk3 --> |
||||
<!ENTITY Iota "Ι" ><!-- greek capital letter iota, U+0399 --> |
||||
<!ENTITY Kappa "Κ" ><!-- greek capital letter kappa, U+039A --> |
||||
<!ENTITY Lambda "Λ" ><!-- greek capital letter lambda, U+039B ISOgrk3 --> |
||||
<!ENTITY Mu "Μ" ><!-- greek capital letter mu, U+039C --> |
||||
<!ENTITY Nu "Ν" ><!-- greek capital letter nu, U+039D --> |
||||
<!ENTITY Xi "Ξ" ><!-- greek capital letter xi, U+039E ISOgrk3 --> |
||||
<!ENTITY Omicron "Ο" ><!-- greek capital letter omicron, U+039F --> |
||||
<!ENTITY Pi "Π" ><!-- greek capital letter pi, U+03A0 ISOgrk3 --> |
||||
<!ENTITY Rho "Ρ" ><!-- greek capital letter rho, U+03A1 --> |
||||
<!-- there is no Sigmaf, and no U+03A2 character either --> |
||||
<!ENTITY Sigma "Σ" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 --> |
||||
<!ENTITY Tau "Τ" ><!-- greek capital letter tau, U+03A4 --> |
||||
<!ENTITY Upsilon "Υ" ><!-- greek capital letter upsilon, |
||||
U+03A5 ISOgrk3 --> |
||||
<!ENTITY Phi "Φ" ><!-- greek capital letter phi, U+03A6 ISOgrk3 --> |
||||
<!ENTITY Chi "Χ" ><!-- greek capital letter chi, U+03A7 --> |
||||
<!ENTITY Psi "Ψ" ><!-- greek capital letter psi, U+03A8 ISOgrk3 --> |
||||
<!ENTITY Omega "Ω" ><!-- greek capital letter omega, U+03A9 ISOgrk3 --> |
||||
<!ENTITY alpha "α" ><!-- greek small letter alpha, U+03B1 ISOgrk3 --> |
||||
<!ENTITY beta "β" ><!-- greek small letter beta, U+03B2 ISOgrk3 --> |
||||
<!ENTITY gamma "γ" ><!-- greek small letter gamma, U+03B3 ISOgrk3 --> |
||||
<!ENTITY delta "δ" ><!-- greek small letter delta, U+03B4 ISOgrk3 --> |
||||
<!ENTITY epsilon "ε" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 --> |
||||
<!ENTITY zeta "ζ" ><!-- greek small letter zeta, U+03B6 ISOgrk3 --> |
||||
<!ENTITY eta "η" ><!-- greek small letter eta, U+03B7 ISOgrk3 --> |
||||
<!ENTITY theta "θ" ><!-- greek small letter theta, U+03B8 ISOgrk3 --> |
||||
<!ENTITY iota "ι" ><!-- greek small letter iota, U+03B9 ISOgrk3 --> |
||||
<!ENTITY kappa "κ" ><!-- greek small letter kappa, U+03BA ISOgrk3 --> |
||||
<!ENTITY lambda "λ" ><!-- greek small letter lambda, U+03BB ISOgrk3 --> |
||||
<!ENTITY mu "μ" ><!-- greek small letter mu, U+03BC ISOgrk3 --> |
||||
<!ENTITY nu "ν" ><!-- greek small letter nu, U+03BD ISOgrk3 --> |
||||
<!ENTITY xi "ξ" ><!-- greek small letter xi, U+03BE ISOgrk3 --> |
||||
<!ENTITY omicron "ο" ><!-- greek small letter omicron, U+03BF NEW --> |
||||
<!ENTITY pi "π" ><!-- greek small letter pi, U+03C0 ISOgrk3 --> |
||||
<!ENTITY rho "ρ" ><!-- greek small letter rho, U+03C1 ISOgrk3 --> |
||||
<!ENTITY sigmaf "ς" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 --> |
||||
<!ENTITY sigma "σ" ><!-- greek small letter sigma, U+03C3 ISOgrk3 --> |
||||
<!ENTITY tau "τ" ><!-- greek small letter tau, U+03C4 ISOgrk3 --> |
||||
<!ENTITY upsilon "υ" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 --> |
||||
<!ENTITY phi "φ" ><!-- greek small letter phi, U+03C6 ISOgrk3 --> |
||||
<!ENTITY chi "χ" ><!-- greek small letter chi, U+03C7 ISOgrk3 --> |
||||
<!ENTITY psi "ψ" ><!-- greek small letter psi, U+03C8 ISOgrk3 --> |
||||
<!ENTITY omega "ω" ><!-- greek small letter omega, U+03C9 ISOgrk3 --> |
||||
<!ENTITY thetasym "ϑ" ><!-- greek small letter theta symbol, U+03D1 NEW --> |
||||
<!ENTITY upsih "ϒ" ><!-- greek upsilon with hook symbol, U+03D2 NEW --> |
||||
<!ENTITY piv "ϖ" ><!-- greek pi symbol, U+03D6 ISOgrk3 --> |
||||
|
||||
<!-- General Punctuation --> |
||||
<!ENTITY bull "•" ><!-- bullet = black small circle, U+2022 ISOpub --> |
||||
<!-- bullet is NOT the same as bullet operator, U+2219 --> |
||||
<!ENTITY hellip "…" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub --> |
||||
<!ENTITY prime "′" ><!-- prime = minutes = feet, U+2032 ISOtech --> |
||||
<!ENTITY Prime "″" ><!-- double prime = seconds = inches, U+2033 ISOtech --> |
||||
<!ENTITY oline "‾" ><!-- overline = spacing overscore, U+203E NEW --> |
||||
<!ENTITY frasl "⁄" ><!-- fraction slash, U+2044 NEW --> |
||||
|
||||
<!-- Letterlike Symbols --> |
||||
<!ENTITY weierp "℘" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso --> |
||||
<!ENTITY image "ℑ" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso --> |
||||
<!ENTITY real "ℜ" ><!-- blackletter capital R = real part symbol, U+211C ISOamso --> |
||||
<!ENTITY trade "™" ><!-- trade mark sign, U+2122 ISOnum --> |
||||
<!ENTITY alefsym "ℵ" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW --> |
||||
<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although |
||||
the same glyph could be used to depict both characters --> |
||||
|
||||
<!-- Arrows --> |
||||
<!ENTITY larr "←" ><!-- leftwards arrow, U+2190 ISOnum --> |
||||
<!ENTITY uarr "↑" ><!-- upwards arrow, U+2191 ISOnum--> |
||||
<!ENTITY rarr "→" ><!-- rightwards arrow, U+2192 ISOnum --> |
||||
<!ENTITY darr "↓" ><!-- downwards arrow, U+2193 ISOnum --> |
||||
<!ENTITY harr "↔" ><!-- left right arrow, U+2194 ISOamsa --> |
||||
<!ENTITY crarr "↵" ><!-- downwards arrow with corner leftwards |
||||
= carriage return, U+21B5 NEW --> |
||||
<!ENTITY lArr "⇐" ><!-- leftwards double arrow, U+21D0 ISOtech --> |
||||
<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow |
||||
but also does not have any other character for that function. So ? lArr can |
||||
be used for 'is implied by' as ISOtech suggests --> |
||||
<!ENTITY uArr "⇑" ><!-- upwards double arrow, U+21D1 ISOamsa --> |
||||
<!ENTITY rArr "⇒" ><!-- rightwards double arrow, U+21D2 ISOtech --> |
||||
<!-- Unicode does not say this is the 'implies' character but does not have |
||||
another character with this function so ? |
||||
rArr can be used for 'implies' as ISOtech suggests --> |
||||
<!ENTITY dArr "⇓" ><!-- downwards double arrow, U+21D3 ISOamsa --> |
||||
<!ENTITY hArr "⇔" ><!-- left right double arrow, U+21D4 ISOamsa --> |
||||
|
||||
<!-- Mathematical Operators --> |
||||
<!ENTITY forall "∀" ><!-- for all, U+2200 ISOtech --> |
||||
<!ENTITY part "∂" ><!-- partial differential, U+2202 ISOtech --> |
||||
<!ENTITY exist "∃" ><!-- there exists, U+2203 ISOtech --> |
||||
<!ENTITY empty "∅" ><!-- empty set = null set, U+2205 ISOamso --> |
||||
<!ENTITY nabla "∇" ><!-- nabla = backward difference, U+2207 ISOtech --> |
||||
<!ENTITY isin "∈" ><!-- element of, U+2208 ISOtech --> |
||||
<!ENTITY notin "∉" ><!-- not an element of, U+2209 ISOtech --> |
||||
<!ENTITY ni "∋" ><!-- contains as member, U+220B ISOtech --> |
||||
<!-- should there be a more memorable name than 'ni'? --> |
||||
<!ENTITY prod "∏" ><!-- n-ary product = product sign, U+220F ISOamsb --> |
||||
<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though |
||||
the same glyph might be used for both --> |
||||
<!ENTITY sum "∑" ><!-- n-ary sumation, U+2211 ISOamsb --> |
||||
<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma' |
||||
though the same glyph might be used for both --> |
||||
<!ENTITY minus "−" ><!-- minus sign, U+2212 ISOtech --> |
||||
<!ENTITY lowast "∗" ><!-- asterisk operator, U+2217 ISOtech --> |
||||
<!ENTITY radic "√" ><!-- square root = radical sign, U+221A ISOtech --> |
||||
<!ENTITY prop "∝" ><!-- proportional to, U+221D ISOtech --> |
||||
<!ENTITY infin "∞" ><!-- infinity, U+221E ISOtech --> |
||||
<!ENTITY ang "∠" ><!-- angle, U+2220 ISOamso --> |
||||
<!ENTITY and "∧" ><!-- logical and = wedge, U+2227 ISOtech --> |
||||
<!ENTITY or "∨" ><!-- logical or = vee, U+2228 ISOtech --> |
||||
<!ENTITY cap "∩" ><!-- intersection = cap, U+2229 ISOtech --> |
||||
<!ENTITY cup "∪" ><!-- union = cup, U+222A ISOtech --> |
||||
<!ENTITY int "∫" ><!-- integral, U+222B ISOtech --> |
||||
<!ENTITY there4 "∴" ><!-- therefore, U+2234 ISOtech --> |
||||
<!ENTITY sim "∼" ><!-- tilde operator = varies with = similar to, U+223C ISOtech --> |
||||
<!-- tilde operator is NOT the same character as the tilde, U+007E, |
||||
although the same glyph might be used to represent both --> |
||||
<!ENTITY cong "≅" ><!-- approximately equal to, U+2245 ISOtech --> |
||||
<!ENTITY asymp "≈" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr --> |
||||
<!ENTITY ne "≠" ><!-- not equal to, U+2260 ISOtech --> |
||||
<!ENTITY equiv "≡" ><!-- identical to, U+2261 ISOtech --> |
||||
<!ENTITY le "≤" ><!-- less-than or equal to, U+2264 ISOtech --> |
||||
<!ENTITY ge "≥" ><!-- greater-than or equal to, U+2265 ISOtech --> |
||||
<!ENTITY sub "⊂" ><!-- subset of, U+2282 ISOtech --> |
||||
<!ENTITY sup "⊃" ><!-- superset of, U+2283 ISOtech --> |
||||
<!-- note that nsup, 'not a superset of, U+2285' is not covered by the Symbol |
||||
font encoding and is not included. Should it be, for symmetry? |
||||
It is in ISOamsn --> |
||||
<!ENTITY nsub "⊄" ><!-- not a subset of, U+2284 ISOamsn --> |
||||
<!ENTITY sube "⊆" ><!-- subset of or equal to, U+2286 ISOtech --> |
||||
<!ENTITY supe "⊇" ><!-- superset of or equal to, U+2287 ISOtech --> |
||||
<!ENTITY oplus "⊕" ><!-- circled plus = direct sum, U+2295 ISOamsb --> |
||||
<!ENTITY otimes "⊗" ><!-- circled times = vector product, U+2297 ISOamsb --> |
||||
<!ENTITY perp "⊥" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech --> |
||||
<!ENTITY sdot "⋅" ><!-- dot operator, U+22C5 ISOamsb --> |
||||
<!-- dot operator is NOT the same character as U+00B7 middle dot --> |
||||
|
||||
<!-- Miscellaneous Technical --> |
||||
<!ENTITY lceil "⌈" ><!-- left ceiling = apl upstile, U+2308 ISOamsc --> |
||||
<!ENTITY rceil "⌉" ><!-- right ceiling, U+2309 ISOamsc --> |
||||
<!ENTITY lfloor "⌊" ><!-- left floor = apl downstile, U+230A ISOamsc --> |
||||
<!ENTITY rfloor "⌋" ><!-- right floor, U+230B ISOamsc --> |
||||
<!ENTITY lang "〈" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech --> |
||||
<!-- lang is NOT the same character as U+003C 'less than' |
||||
or U+2039 'single left-pointing angle quotation mark' --> |
||||
<!ENTITY rang "〉" ><!-- right-pointing angle bracket = ket, U+232A ISOtech --> |
||||
<!-- rang is NOT the same character as U+003E 'greater than' |
||||
or U+203A 'single right-pointing angle quotation mark' --> |
||||
|
||||
<!-- Geometric Shapes --> |
||||
<!ENTITY loz "◊" ><!-- lozenge, U+25CA ISOpub --> |
||||
|
||||
<!-- Miscellaneous Symbols --> |
||||
<!ENTITY spades "♠" ><!-- black spade suit, U+2660 ISOpub --> |
||||
<!-- black here seems to mean filled as opposed to hollow --> |
||||
<!ENTITY clubs "♣" ><!-- black club suit = shamrock, U+2663 ISOpub --> |
||||
<!ENTITY hearts "♥" ><!-- black heart suit = valentine, U+2665 ISOpub --> |
||||
<!ENTITY diams "♦" ><!-- black diamond suit, U+2666 ISOpub --> |
||||
|
||||
<!-- end of xhtml-symbol.ent --> |
||||
""" |
||||
return text |
||||
|
||||
def get_apache_license(): |
||||
license = r"""/** |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0 |
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
""" |
||||
return license |
||||
|
||||
main() |
@ -0,0 +1,61 @@
|
||||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
||||
<!-- |
||||
Licensed to the Apache Software Foundation (ASF) under one or more |
||||
contributor license agreements. See the NOTICE file distributed with |
||||
this work for additional information regarding copyright ownership. |
||||
The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
(the "License"); you may not use this file except in compliance with |
||||
the License. You may obtain a copy of the License at |
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
||||
Unless required by applicable law or agreed to in writing, software |
||||
distributed under the License is distributed on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
See the License for the specific language governing permissions and |
||||
limitations under the License. |
||||
--> |
||||
<html><head></head> |
||||
<body> |
||||
<p> |
||||
Normalization of text before the tokenizer. |
||||
</p> |
||||
<p> |
||||
CharFilters are chainable filters that normalize text before tokenization |
||||
and provide mappings between normalized text offsets and the corresponding |
||||
offsets in the original text. |
||||
</p> |
||||
<H2>CharFilter offset mappings</H2> |
||||
<p> |
||||
CharFilters modify an input stream via a series of substring |
||||
replacements (including deletions and insertions) to produce an output |
||||
stream. There are three possible replacement cases: the replacement |
||||
string has the same length as the original substring; the replacement |
||||
is shorter; and the replacement is longer. In the latter two cases |
||||
(when the replacement has a different length than the original), |
||||
one or more offset correction mappings are required. |
||||
</p> |
||||
<p> |
||||
When the replacement is shorter than the original (e.g. when the |
||||
replacement is the empty string), a single offset correction mapping |
||||
should be added at the replacement's end offset in the output stream. |
||||
The <code>cumulativeDiff</code> parameter to the |
||||
<code>addOffCorrectMapping()</code> method will be the sum of all |
||||
previous replacement offset adjustments, with the addition of the |
||||
difference between the lengths of the original substring and the |
||||
replacement string (a positive value). |
||||
</p> |
||||
<p> |
||||
When the replacement is longer than the original (e.g. when the |
||||
original is the empty string), you should add as many offset |
||||
correction mappings as the difference between the lengths of the |
||||
replacement string and the original substring, starting at the |
||||
end offset the original substring would have had in the output stream. |
||||
The <code>cumulativeDiff</code> parameter to the |
||||
<code>addOffCorrectMapping()</code> method will be the sum of all |
||||
previous replacement offset adjustments, with the addition of the |
||||
difference between the lengths of the original substring and the |
||||
replacement string so far (a negative value). |
||||
</p> |
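<p> |
A minimal sketch of the shorter-replacement case (hedged: a toy filter, not part |
of this package, that deletes soft hyphens (U+00AD) and records one offset |
correction per deletion via BaseCharFilter's addOffCorrectMapping()): |
</p> |
<pre class="prettyprint"> |
public final class StripSoftHyphenCharFilter extends BaseCharFilter { |
  private int readFromInput = 0;   // offset in the original text |
  private int written = 0;         // offset in the output stream |
 |
  public StripSoftHyphenCharFilter(Reader in) { super(in); } |
 |
  @Override |
  public int read(char[] cbuf, int off, int len) throws IOException { |
    int n = 0; |
    while (n < len) { |
      int c = input.read(); |
      if (c == -1) break; |
      readFromInput++; |
      if (c == '\u00AD') { |
        // empty replacement: record the cumulative difference at the |
        // replacement's end offset in the output stream (a positive value) |
        addOffCorrectMapping(written, readFromInput - written); |
        continue; |
      } |
      cbuf[off + n++] = (char) c; |
      written++; |
    } |
    return n == 0 ? -1 : n; |
  } |
} |
</pre> |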
||||
</body> |
||||
</html> |
@ -0,0 +1,104 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.Analyzer; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.Tokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.core.LowerCaseFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.core.StopFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet; |
||||
import com.fr.third.org.apache.lucene.analysis.util.StopwordAnalyzerBase; |
||||
import com.fr.third.org.apache.lucene.util.Version; |
||||
|
||||
/** |
||||
* An {@link Analyzer} that tokenizes text with {@link StandardTokenizer}, |
||||
* normalizes content with {@link CJKWidthFilter}, folds case with |
||||
* {@link LowerCaseFilter}, forms bigrams of CJK with {@link CJKBigramFilter}, |
||||
* and filters stopwords with {@link StopFilter}. |
||||
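* <p> |
* A minimal sketch of supplying a custom stopword set (hypothetical stopwords; |
* the two-argument constructor is declared below): |
* <pre class="prettyprint"> |
*   CharArraySet stopwords = new CharArraySet(Version.LUCENE_36, |
*       Arrays.asList("の", "に"), true); |
*   Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_36, stopwords); |
* </pre> |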
*/ |
||||
public final class CJKAnalyzer extends StopwordAnalyzerBase { |
||||
/** |
||||
* File containing default CJK stopwords. |
||||
* <p/> |
||||
* Currently it contains some common English words that are not usually |
||||
* useful for searching and some double-byte punctuation characters. |
||||
*/ |
||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; |
||||
|
||||
/** |
||||
* Returns an unmodifiable instance of the default stop-words set. |
||||
* @return an unmodifiable instance of the default stop-words set. |
||||
*/ |
||||
public static CharArraySet getDefaultStopSet(){ |
||||
return DefaultSetHolder.DEFAULT_STOP_SET; |
||||
} |
||||
|
||||
private static class DefaultSetHolder { |
||||
static final CharArraySet DEFAULT_STOP_SET; |
||||
|
||||
static { |
||||
try { |
||||
DEFAULT_STOP_SET = loadStopwordSet(false, CJKAnalyzer.class, DEFAULT_STOPWORD_FILE, "#"); |
||||
} catch (IOException ex) { |
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new RuntimeException("Unable to load default stopword set", ex); |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer which removes words in {@link #getDefaultStopSet()}. |
||||
*/ |
||||
public CJKAnalyzer(Version matchVersion) { |
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET); |
||||
} |
||||
|
||||
/** |
||||
* Builds an analyzer with the given stop words |
||||
* |
||||
* @param matchVersion |
||||
* lucene compatibility version |
||||
* @param stopwords |
||||
* a stopword set |
||||
*/ |
||||
public CJKAnalyzer(Version matchVersion, CharArraySet stopwords){ |
||||
super(matchVersion, stopwords); |
||||
} |
||||
|
||||
@Override |
||||
protected TokenStreamComponents createComponents(String fieldName, |
||||
Reader reader) { |
||||
if (matchVersion.onOrAfter(Version.LUCENE_36)) { |
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader); |
||||
// run CJKWidthFilter before bigramming, since it sometimes combines two characters into one.
|
||||
TokenStream result = new CJKWidthFilter(source); |
||||
result = new LowerCaseFilter(matchVersion, result); |
||||
result = new CJKBigramFilter(result); |
||||
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords)); |
||||
} else { |
||||
final Tokenizer source = new CJKTokenizer(reader); |
||||
return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords)); |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,363 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
||||
import com.fr.third.org.apache.lucene.util.ArrayUtil; |
||||
|
||||
/** |
||||
* Forms bigrams of CJK terms that are generated from StandardTokenizer |
||||
* or ICUTokenizer. |
||||
* <p> |
||||
* CJK types are set by these tokenizers, but you can also use |
||||
* {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which |
||||
* of the CJK scripts are turned into bigrams. |
||||
* <p> |
||||
* By default, when a CJK character has no adjacent characters to form |
||||
* a bigram, it is output in unigram form. If you want to always output |
||||
* both unigrams and bigrams, set the <code>outputUnigrams</code> |
||||
* flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}. |
||||
* This can be used for a combined unigram+bigram approach. |
||||
* <p> |
||||
* In all cases, all non-CJK input is passed through unmodified. |
||||
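* <p> |
* A minimal configuration sketch (hypothetical input stream; the flags are the |
* constants declared on this class): |
* <pre class="prettyprint"> |
*   TokenStream ts = new StandardTokenizer(Version.LUCENE_36, reader); |
*   // bigram only Han and Hangul; Hiragana and Katakana pass through unchanged |
*   ts = new CJKBigramFilter(ts, CJKBigramFilter.HAN | CJKBigramFilter.HANGUL); |
* </pre> |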
*/ |
||||
public final class CJKBigramFilter extends TokenFilter { |
||||
// configuration
|
||||
/** bigram flag for Han Ideographs */ |
||||
public static final int HAN = 1; |
||||
/** bigram flag for Hiragana */ |
||||
public static final int HIRAGANA = 2; |
||||
/** bigram flag for Katakana */ |
||||
public static final int KATAKANA = 4; |
||||
/** bigram flag for Hangul */ |
||||
public static final int HANGUL = 8; |
||||
|
||||
/** when we emit a bigram, it is then marked as this type */ |
||||
public static final String DOUBLE_TYPE = "<DOUBLE>"; |
||||
/** when we emit a unigram, it is then marked as this type */ |
||||
public static final String SINGLE_TYPE = "<SINGLE>"; |
||||
|
||||
// the types from standardtokenizer
|
||||
private static final String HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]; |
||||
private static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA]; |
||||
private static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA]; |
||||
private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL]; |
||||
|
||||
// sentinel value for ignoring a script
|
||||
private static final Object NO = new Object(); |
||||
|
||||
// these are set to either their type or NO if we want to pass them thru
|
||||
private final Object doHan; |
||||
private final Object doHiragana; |
||||
private final Object doKatakana; |
||||
private final Object doHangul; |
||||
|
||||
// true if we should output unigram tokens always
|
||||
private final boolean outputUnigrams; |
||||
private boolean ngramState; // false = output unigram, true = output bigram
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); |
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
||||
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); |
||||
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class); |
||||
|
||||
// buffers containing codepoint and offsets in parallel
|
||||
int buffer[] = new int[8]; |
||||
int startOffset[] = new int[8]; |
||||
int endOffset[] = new int[8]; |
||||
// length of valid buffer
|
||||
int bufferLen; |
||||
// current buffer index
|
||||
int index; |
||||
|
||||
// the last end offset, to determine if we should bigram across tokens
|
||||
int lastEndOffset; |
||||
|
||||
private boolean exhausted; |
||||
|
||||
/** |
||||
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int) |
||||
* CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)} |
||||
*/ |
||||
public CJKBigramFilter(TokenStream in) { |
||||
this(in, HAN | HIRAGANA | KATAKANA | HANGUL); |
||||
} |
||||
|
||||
/** |
||||
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean) |
||||
* CJKBigramFilter(in, flags, false)} |
||||
*/ |
||||
public CJKBigramFilter(TokenStream in, int flags) { |
||||
this(in, flags, false); |
||||
} |
||||
|
||||
/** |
||||
* Create a new CJKBigramFilter, specifying which writing systems should be bigrammed, |
||||
* and whether or not unigrams should also be output. |
||||
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA}, |
||||
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL} |
||||
* @param outputUnigrams true if unigrams for the selected writing systems should also be output. |
||||
* When this is false, this is only done when there are no adjacent characters to form |
||||
* a bigram. |
||||
*/ |
||||
public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) { |
||||
super(in); |
||||
doHan = (flags & HAN) == 0 ? NO : HAN_TYPE; |
||||
doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE; |
||||
doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE; |
||||
doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE; |
||||
this.outputUnigrams = outputUnigrams; |
||||
} |
||||
|
||||
/* |
||||
* much of this complexity revolves around handling the special case of a |
||||
* "lone cjk character" where cjktokenizer would output a unigram. this |
||||
* is also the only time we ever have to captureState. |
||||
*/ |
||||
@Override |
||||
public boolean incrementToken() throws IOException { |
||||
while (true) { |
||||
if (hasBufferedBigram()) { |
||||
|
||||
// case 1: we have multiple remaining codepoints buffered,
|
||||
// so we can emit a bigram here.
|
||||
|
||||
if (outputUnigrams) { |
||||
|
||||
// when also outputting unigrams, we output the unigram first,
|
||||
// then rewind back to revisit the bigram.
|
||||
// so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
|
||||
// the logic in hasBufferedUnigram ensures we output the C,
|
||||
// even though it did actually have adjacent CJK characters.
|
||||
|
||||
if (ngramState) { |
||||
flushBigram(); |
||||
} else { |
||||
flushUnigram(); |
||||
index--; |
||||
} |
||||
ngramState = !ngramState; |
||||
} else { |
||||
flushBigram(); |
||||
} |
||||
return true; |
||||
} else if (doNext()) { |
||||
|
||||
// case 2: look at the token type. should we form any n-grams?
|
||||
|
||||
String type = typeAtt.type(); |
||||
if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul) { |
||||
|
||||
// acceptable CJK type: we form n-grams from these.
|
||||
// as long as the offsets are aligned, we just add these to our current buffer.
|
||||
// otherwise, we clear the buffer and start over.
|
||||
|
||||
if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue
|
||||
if (hasBufferedUnigram()) { |
||||
|
||||
// we have a buffered unigram, and we peeked ahead to see if we could form
|
||||
// a bigram, but we can't, because the offsets are unaligned. capture the state
|
||||
// of this peeked data to be revisited next time through the loop, and dump our unigram.
|
||||
|
||||
loneState = captureState(); |
||||
flushUnigram(); |
||||
return true; |
||||
} |
||||
index = 0; |
||||
bufferLen = 0; |
||||
} |
||||
refill(); |
||||
} else { |
||||
|
||||
// not a CJK type: we just return these as-is.
|
||||
|
||||
if (hasBufferedUnigram()) { |
||||
|
||||
// we have a buffered unigram, and we peeked ahead to see if we could form
|
||||
// a bigram, but we can't, because it's not a CJK type. capture the state
|
||||
// of this peeked data to be revisited next time through the loop, and dump our unigram.
|
||||
|
||||
loneState = captureState(); |
||||
flushUnigram(); |
||||
return true; |
||||
} |
||||
return true; |
||||
} |
||||
} else { |
||||
|
||||
// case 3: we have only zero or 1 codepoints buffered,
|
||||
// so not enough to form a bigram. But, we also have no
|
||||
// more input. So if we have a buffered codepoint, emit
|
||||
// a unigram; otherwise, it's end of stream.
|
||||
|
||||
if (hasBufferedUnigram()) { |
||||
flushUnigram(); // flush our remaining unigram
|
||||
return true; |
||||
} |
||||
return false; |
||||
} |
||||
} |
||||
} |
||||
|
||||
private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams
|
||||
|
||||
/** |
||||
* looks at the next input token, returning false if none is available |
||||
*/ |
||||
private boolean doNext() throws IOException { |
||||
if (loneState != null) { |
||||
restoreState(loneState); |
||||
loneState = null; |
||||
return true; |
||||
} else { |
||||
if (exhausted) { |
||||
return false; |
||||
} else if (input.incrementToken()) { |
||||
return true; |
||||
} else { |
||||
exhausted = true; |
||||
return false; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* refills buffers with new data from the current token. |
||||
*/ |
||||
private void refill() { |
||||
// compact buffers to keep them smallish if they become large
|
||||
// just a safety check, but technically we only need the last codepoint
|
||||
if (bufferLen > 64) { |
||||
int last = bufferLen - 1; |
||||
buffer[0] = buffer[last]; |
||||
startOffset[0] = startOffset[last]; |
||||
endOffset[0] = endOffset[last]; |
||||
bufferLen = 1; |
||||
index -= last; |
||||
} |
||||
|
||||
char termBuffer[] = termAtt.buffer(); |
||||
int len = termAtt.length(); |
||||
int start = offsetAtt.startOffset(); |
||||
int end = offsetAtt.endOffset(); |
||||
|
||||
int newSize = bufferLen + len; |
||||
buffer = ArrayUtil.grow(buffer, newSize); |
||||
startOffset = ArrayUtil.grow(startOffset, newSize); |
||||
endOffset = ArrayUtil.grow(endOffset, newSize); |
||||
lastEndOffset = end; |
||||
|
||||
if (end - start != len) { |
||||
// offsets out of sync with the term length (modified by a synonym filter or charfilter): just preserve
|
||||
for (int i = 0, cp = 0; i < len; i += Character.charCount(cp)) { |
||||
cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len); |
||||
startOffset[bufferLen] = start; |
||||
endOffset[bufferLen] = end; |
||||
bufferLen++; |
||||
} |
||||
} else { |
||||
// normal offsets
|
||||
for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen) { |
||||
cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len); |
||||
cpLen = Character.charCount(cp); |
||||
startOffset[bufferLen] = start; |
||||
start = endOffset[bufferLen] = start + cpLen; |
||||
bufferLen++; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/** |
||||
* Flushes a bigram token to output from our buffer |
||||
* This is the normal case, e.g. ABC -> AB BC |
||||
*/ |
||||
private void flushBigram() { |
||||
clearAttributes(); |
||||
char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
|
||||
int len1 = Character.toChars(buffer[index], termBuffer, 0); |
||||
int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1); |
||||
termAtt.setLength(len2); |
||||
offsetAtt.setOffset(startOffset[index], endOffset[index+1]); |
||||
typeAtt.setType(DOUBLE_TYPE); |
||||
// when outputting unigrams, all bigrams are synonyms that span two unigrams
|
||||
if (outputUnigrams) { |
||||
posIncAtt.setPositionIncrement(0); |
||||
posLengthAtt.setPositionLength(2); |
||||
} |
||||
index++; |
||||
} |
||||
|
||||
/** |
||||
* Flushes a unigram token to output from our buffer. |
||||
* This happens when we encounter isolated CJK characters, either the whole |
||||
* CJK string is a single character, or we encounter a CJK character surrounded |
||||
* by space, punctuation, english, etc, but not beside any other CJK. |
||||
*/ |
||||
private void flushUnigram() { |
||||
clearAttributes(); |
||||
char termBuffer[] = termAtt.resizeBuffer(2); // maximum unigram length in code units (one surrogate pair)
|
||||
int len = Character.toChars(buffer[index], termBuffer, 0); |
||||
termAtt.setLength(len); |
||||
offsetAtt.setOffset(startOffset[index], endOffset[index]); |
||||
typeAtt.setType(SINGLE_TYPE); |
||||
index++; |
||||
} |
||||
|
||||
/** |
||||
* True if we have multiple codepoints sitting in our buffer |
||||
*/ |
||||
private boolean hasBufferedBigram() { |
||||
return bufferLen - index > 1; |
||||
} |
||||
|
||||
/** |
||||
* True if we have a single codepoint sitting in our buffer, where its future |
||||
* (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen |
||||
* inputs. |
||||
*/ |
||||
private boolean hasBufferedUnigram() { |
||||
if (outputUnigrams) { |
||||
// when always outputting unigrams
|
||||
return bufferLen - index == 1; |
||||
} else { |
||||
// otherwise it's only when we have a lone CJK character
|
||||
return bufferLen == 1 && index == 0; |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public void reset() throws IOException { |
||||
super.reset(); |
||||
bufferLen = 0; |
||||
index = 0; |
||||
lastEndOffset = 0; |
||||
loneState = null; |
||||
exhausted = false; |
||||
ngramState = false; |
||||
} |
||||
} |
@ -0,0 +1,67 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.util.Map; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.cjk.CJKBigramFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory; |
||||
|
||||
/** |
||||
* Factory for {@link CJKBigramFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_cjk" class="solr.TextField"> |
||||
* <analyzer> |
||||
* <tokenizer class="solr.StandardTokenizerFactory"/> |
||||
* <filter class="solr.CJKWidthFilterFactory"/> |
||||
* <filter class="solr.LowerCaseFilterFactory"/> |
||||
* <filter class="solr.CJKBigramFilterFactory" |
||||
* han="true" hiragana="true" |
||||
* katakana="true" hangul="true" outputUnigrams="false" /> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
*/ |
||||
public class CJKBigramFilterFactory extends TokenFilterFactory { |
||||
int flags; |
||||
boolean outputUnigrams; |
||||
|
||||
@Override |
||||
public void init(Map<String,String> args) { |
||||
super.init(args); |
||||
flags = 0; |
||||
if (getBoolean("han", true)) { |
||||
flags |= CJKBigramFilter.HAN; |
||||
} |
||||
if (getBoolean("hiragana", true)) { |
||||
flags |= CJKBigramFilter.HIRAGANA; |
||||
} |
||||
if (getBoolean("katakana", true)) { |
||||
flags |= CJKBigramFilter.KATAKANA; |
||||
} |
||||
if (getBoolean("hangul", true)) { |
||||
flags |= CJKBigramFilter.HANGUL; |
||||
} |
||||
outputUnigrams = getBoolean("outputUnigrams", false); |
||||
} |
||||
|
||||
@Override |
||||
public TokenStream create(TokenStream input) { |
||||
return new CJKBigramFilter(input, flags, outputUnigrams); |
||||
} |
||||
} |
@ -0,0 +1,311 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.io.Reader; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.Tokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
||||
import com.fr.third.org.apache.lucene.util.AttributeSource; |
||||
|
||||
/** |
||||
* CJKTokenizer is designed for Chinese, Japanese, and Korean languages. |
||||
* <p> |
||||
* The tokens returned are every two adjacent characters with overlap match. |
||||
* </p> |
||||
* <p> |
||||
* Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4". |
||||
* </p> |
||||
* Additionally, the following is applied to Latin text (such as English): |
||||
* <ul> |
||||
* <li>Text is converted to lowercase. |
||||
* <li>Numeric digits, '+', '#', and '_' are tokenized as letters. |
||||
* <li>Full-width forms are converted to half-width forms. |
||||
* </ul> |
||||
* For more info on Asian language (Chinese, Japanese, and Korean) text segmentation, |
||||
* please search <a |
||||
* href="http://www.google.com/search?q=word+chinese+segment">google</a> |
||||
* |
||||
* @deprecated Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead. |
||||
*/ |
||||
@Deprecated |
||||
public final class CJKTokenizer extends Tokenizer { |
||||
//~ Static fields/initializers ---------------------------------------------
|
||||
/** Word token type */ |
||||
static final int WORD_TYPE = 0; |
||||
|
||||
/** Single byte token type */ |
||||
static final int SINGLE_TOKEN_TYPE = 1; |
||||
|
||||
/** Double byte token type */ |
||||
static final int DOUBLE_TOKEN_TYPE = 2; |
||||
|
||||
/** Names for token types */ |
||||
static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" }; |
||||
|
||||
/** Max word length */ |
||||
private static final int MAX_WORD_LEN = 255; |
||||
|
||||
/** buffer size: */ |
||||
private static final int IO_BUFFER_SIZE = 256; |
||||
|
||||
//~ Instance fields --------------------------------------------------------
|
||||
|
||||
/** word offset, used to indicate which character in the input is being parsed */ |
||||
private int offset = 0; |
||||
|
||||
/** the index used only for ioBuffer */ |
||||
private int bufferIndex = 0; |
||||
|
||||
/** data length */ |
||||
private int dataLen = 0; |
||||
|
||||
/** |
||||
* character buffer, stores the characters which are used to compose <br> |
||||
* the returned Token |
||||
*/ |
||||
private final char[] buffer = new char[MAX_WORD_LEN]; |
||||
|
||||
/** |
||||
* I/O buffer, used to store the content of the input (one of the <br> |
||||
* members of Tokenizer) |
||||
*/ |
||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; |
||||
|
||||
/** word type: single=>ASCII double=>non-ASCII word=>default */ |
||||
private int tokenType = WORD_TYPE; |
||||
|
||||
/** |
||||
* flag: the previous character was a cached double-byte character. "C1C2C3C4" |
||||
* ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened) |
||||
* C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4" |
||||
*/ |
||||
private boolean preIsTokened = false; |
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); |
||||
|
||||
//~ Constructors -----------------------------------------------------------
|
||||
|
||||
/** |
||||
* Construct a token stream processing the given input. |
||||
* |
||||
* @param in I/O reader |
||||
*/ |
||||
public CJKTokenizer(Reader in) { |
||||
super(in); |
||||
} |
||||
|
||||
public CJKTokenizer(AttributeSource source, Reader in) { |
||||
super(source, in); |
||||
} |
||||
|
||||
public CJKTokenizer(AttributeFactory factory, Reader in) { |
||||
super(factory, in); |
||||
} |
||||
|
||||
//~ Methods ----------------------------------------------------------------
|
||||
|
||||
/** |
||||
* Returns true for the next token in the stream, or false at EOS. |
||||
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
|
||||
* for detail. |
||||
* |
||||
* @return false for end of stream, true otherwise |
||||
* |
||||
* @throws IOException - thrown when a read error <br> |
||||
* occurs in the InputStream |
||||
* |
||||
*/ |
||||
@Override |
||||
public boolean incrementToken() throws IOException { |
||||
clearAttributes(); |
||||
/** how many characters have been stored in the buffer */ |
||||
|
||||
while(true) { // loop until we find a non-empty token
|
||||
|
||||
int length = 0; |
||||
|
||||
/** the position used to create Token */ |
||||
int start = offset; |
||||
|
||||
while (true) { // loop until we've found a full token
|
||||
/** current character */ |
||||
char c; |
||||
|
||||
/** unicode block of current character for detail */ |
||||
Character.UnicodeBlock ub; |
||||
|
||||
offset++; |
||||
|
||||
if (bufferIndex >= dataLen) { |
||||
dataLen = input.read(ioBuffer); |
||||
bufferIndex = 0; |
||||
} |
||||
|
||||
if (dataLen == -1) { |
||||
if (length > 0) { |
||||
if (preIsTokened) { |
||||
length = 0; |
||||
preIsTokened = false; |
||||
} |
||||
else { |
||||
offset--; |
||||
} |
||||
|
||||
break; |
||||
} else { |
||||
offset--; |
||||
return false; |
||||
} |
||||
} else { |
||||
//get current character
|
||||
c = ioBuffer[bufferIndex++]; |
||||
|
||||
//get the UnicodeBlock of the current character
|
||||
ub = Character.UnicodeBlock.of(c); |
||||
} |
||||
|
||||
//if the current character is ASCII or Extend ASCII
|
||||
if ((ub == Character.UnicodeBlock.BASIC_LATIN) |
||||
|| (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) |
||||
) { |
||||
if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) { |
||||
int i = (int) c; |
||||
if (i >= 65281 && i <= 65374) { |
||||
// convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
|
||||
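// 65281..65374 is the fullwidth range U+FF01..U+FF5E; subtracting |
// 65248 (0xFEE0) yields the corresponding ASCII character U+0021..U+007E |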
i = i - 65248; |
||||
c = (char) i; |
||||
} |
||||
} |
||||
|
||||
// if the current character is a letter or "_" "+" "#"
|
||||
if (Character.isLetterOrDigit(c) |
||||
|| ((c == '_') || (c == '+') || (c == '#')) |
||||
) { |
||||
if (length == 0) { |
||||
// "javaC1C2C3C4linux" <br>
|
||||
// ^--: the current character begins an ASCII-letter
|
||||
// token
|
||||
start = offset - 1; |
||||
} else if (tokenType == DOUBLE_TOKEN_TYPE) { |
||||
// "javaC1C2C3C4linux" <br>
|
||||
// ^--: the previous non-ASCII
|
||||
// : the current character
|
||||
offset--; |
||||
bufferIndex--; |
||||
|
||||
if (preIsTokened) { |
||||
// only one non-ASCII character has been stored
|
||||
length = 0; |
||||
preIsTokened = false; |
||||
break; |
||||
} else { |
||||
break; |
||||
} |
||||
} |
||||
|
||||
// store the LowerCase(c) in the buffer
|
||||
buffer[length++] = Character.toLowerCase(c); |
||||
tokenType = SINGLE_TOKEN_TYPE; |
||||
|
||||
// break the procedure if buffer overflowed!
|
||||
if (length == MAX_WORD_LEN) { |
||||
break; |
||||
} |
||||
} else if (length > 0) { |
||||
if (preIsTokened) { |
||||
length = 0; |
||||
preIsTokened = false; |
||||
} else { |
||||
break; |
||||
} |
||||
} |
||||
} else { |
||||
// non-ASCII letter, e.g."C1C2C3C4"
|
||||
if (Character.isLetter(c)) { |
||||
if (length == 0) { |
||||
start = offset - 1; |
||||
buffer[length++] = c; |
||||
tokenType = DOUBLE_TOKEN_TYPE; |
||||
} else { |
||||
if (tokenType == SINGLE_TOKEN_TYPE) { |
||||
offset--; |
||||
bufferIndex--; |
||||
|
||||
//return the previous ASCII characters
|
||||
break; |
||||
} else { |
||||
buffer[length++] = c; |
||||
tokenType = DOUBLE_TOKEN_TYPE; |
||||
|
||||
if (length == 2) { |
||||
offset--; |
||||
bufferIndex--; |
||||
preIsTokened = true; |
||||
|
||||
break; |
||||
} |
||||
} |
||||
} |
||||
} else if (length > 0) { |
||||
if (preIsTokened) { |
||||
// empty the buffer
|
||||
length = 0; |
||||
preIsTokened = false; |
||||
} else { |
||||
break; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
if (length > 0) { |
||||
termAtt.copyBuffer(buffer, 0, length); |
||||
offsetAtt.setOffset(correctOffset(start), correctOffset(start+length)); |
||||
typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]); |
||||
return true; |
||||
} else if (dataLen == -1) { |
||||
offset--; |
||||
return false; |
||||
} |
||||
|
||||
// Cycle back and try for the next token (don't
|
||||
// return an empty string)
|
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public final void end() { |
||||
// set final offset
|
||||
final int finalOffset = correctOffset(offset); |
||||
this.offsetAtt.setOffset(finalOffset, finalOffset); |
||||
} |
||||
|
||||
@Override |
||||
public void reset() throws IOException { |
||||
super.reset(); |
||||
offset = bufferIndex = dataLen = 0; |
||||
preIsTokened = false; |
||||
tokenType = WORD_TYPE; |
||||
} |
||||
} |
@ -0,0 +1,41 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.cjk.CJKTokenizer; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenizerFactory; |
||||
|
||||
import java.io.Reader; |
||||
|
||||
/** |
||||
* Factory for {@link CJKTokenizer}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100"> |
||||
* <analyzer> |
||||
* <tokenizer class="solr.CJKTokenizerFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
* @deprecated Use {@link CJKBigramFilterFactory} instead. |
||||
*/ |
||||
@Deprecated |
||||
public class CJKTokenizerFactory extends TokenizerFactory { |
||||
public CJKTokenizer create(Reader in) { |
||||
return new CJKTokenizer(in); |
||||
} |
||||
} |
||||
|
@ -0,0 +1,112 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.util.StemmerUtil; |
||||
|
||||
/** |
||||
* A {@link TokenFilter} that normalizes CJK width differences: |
||||
* <ul> |
||||
* <li>Folds fullwidth ASCII variants into the equivalent basic latin |
||||
* <li>Folds halfwidth Katakana variants into the equivalent kana |
||||
* </ul> |
||||
* <p> |
||||
* NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD |
||||
* Unicode normalization. See the normalization support in the ICU package
|
||||
* for full normalization. |
||||
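* <p> |
* A minimal usage sketch (hypothetical input stream; fullwidth "Ｌｕｃｅｎｅ" |
* folds to "Lucene", and halfwidth "ｶ" followed by "ﾞ" combines to "ガ"): |
* <pre class="prettyprint"> |
*   TokenStream ts = new StandardTokenizer(Version.LUCENE_36, reader); |
*   ts = new CJKWidthFilter(ts); |
* </pre> |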
*/ |
||||
public final class CJKWidthFilter extends TokenFilter { |
||||
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
|
||||
/* halfwidth kana mappings: 0xFF65-0xFF9F |
||||
* |
||||
* note: 0xFF9E and 0xFF9F are only mapped to 0x3099 and 0x309A |
||||
* as a fallback when they cannot properly combine with a preceding |
||||
* character into a composed form. |
||||
*/ |
||||
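// e.g. halfwidth "ｶ" (U+FF76) maps to "カ" (U+30AB); a following voiced mark |
// "ﾞ" (U+FF9E) is then merged into "ガ" (U+30AC) by combine() below |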
private static final char KANA_NORM[] = new char[] { |
||||
0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5, |
||||
0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab, |
||||
0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd, |
||||
0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd, |
||||
0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0, |
||||
0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec, |
||||
0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A |
||||
}; |
||||
|
||||
public CJKWidthFilter(TokenStream input) { |
||||
super(input); |
||||
} |
||||
|
||||
@Override |
public boolean incrementToken() throws IOException { |
||||
if (input.incrementToken()) { |
||||
char text[] = termAtt.buffer(); |
||||
int length = termAtt.length(); |
||||
for (int i = 0; i < length; i++) { |
||||
final char ch = text[i]; |
||||
if (ch >= 0xFF01 && ch <= 0xFF5E) { |
||||
// Fullwidth ASCII variants
|
||||
text[i] -= 0xFEE0; |
||||
} else if (ch >= 0xFF65 && ch <= 0xFF9F) { |
||||
// Halfwidth Katakana variants
|
||||
if ((ch == 0xFF9E || ch == 0xFF9F) && i > 0 && combine(text, i, ch)) { |
||||
length = StemmerUtil.delete(text, i--, length); |
||||
} else { |
||||
text[i] = KANA_NORM[ch - 0xFF65]; |
||||
} |
||||
} |
||||
} |
||||
termAtt.setLength(length); |
||||
return true; |
||||
} else { |
||||
return false; |
||||
} |
||||
} |
||||
|
||||
/* kana combining diffs: 0x30A6-0x30FD */ |
||||
private static final byte KANA_COMBINE_VOICED[] = new byte[] { |
||||
78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, |
||||
0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, |
||||
0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 |
||||
}; |
||||
|
||||
private static final byte KANA_COMBINE_HALF_VOICED[] = new byte[] { |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, |
||||
0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
||||
}; |
||||
|
||||
/** returns true if we successfully combined the voice mark */ |
||||
private static boolean combine(char text[], int pos, char ch) { |
||||
final char prev = text[pos-1]; |
||||
if (prev >= 0x30A6 && prev <= 0x30FD) { |
||||
text[pos-1] += (ch == 0xFF9F) |
||||
? KANA_COMBINE_HALF_VOICED[prev - 0x30A6] |
||||
: KANA_COMBINE_VOICED[prev - 0x30A6]; |
||||
return text[pos-1] != prev; |
||||
} |
||||
return false; |
||||
} |
||||
} |
@ -0,0 +1,50 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cjk; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.cjk.CJKWidthFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.util.AbstractAnalysisFactory; |
||||
import com.fr.third.org.apache.lucene.analysis.util.MultiTermAwareComponent; |
||||
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory; |
||||
|
||||
/** |
||||
* Factory for {@link CJKWidthFilter}. |
||||
* <pre class="prettyprint" > |
||||
* <fieldType name="text_cjk" class="solr.TextField"> |
||||
* <analyzer> |
||||
* <tokenizer class="solr.StandardTokenizerFactory"/> |
||||
* <filter class="solr.CJKWidthFilterFactory"/> |
||||
* <filter class="solr.LowerCaseFilterFactory"/> |
||||
* <filter class="solr.CJKBigramFilterFactory"/> |
||||
* </analyzer> |
||||
* </fieldType></pre> |
||||
*/ |
||||
|
||||
public class CJKWidthFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { |
||||
|
||||
@Override |
||||
public TokenStream create(TokenStream input) { |
||||
return new CJKWidthFilter(input); |
||||
} |
||||
|
||||
@Override |
||||
public AbstractAnalysisFactory getMultiTermComponent() { |
||||
return this; |
||||
} |
||||
} |
@ -0,0 +1,42 @@
|
||||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> |
||||
<!-- |
||||
Licensed to the Apache Software Foundation (ASF) under one or more |
||||
contributor license agreements. See the NOTICE file distributed with |
||||
this work for additional information regarding copyright ownership. |
||||
The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
(the "License"); you may not use this file except in compliance with |
||||
the License. You may obtain a copy of the License at |
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
||||
Unless required by applicable law or agreed to in writing, software |
||||
distributed under the License is distributed on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
See the License for the specific language governing permissions and |
||||
limitations under the License. |
||||
--> |
||||
<html> |
||||
<head> |
||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8"> |
||||
</head> |
||||
<body> |
||||
Analyzer for Chinese, Japanese, and Korean, which indexes bigrams. |
||||
This analyzer generates bigram terms, which are overlapping groups of two adjacent Han, Hiragana, Katakana, or Hangul characters. |
||||
<p> |
||||
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way. |
||||
<ul> |
||||
<li>ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as tokens. |
||||
<li>CJKAnalyzer (in this package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens. |
||||
<li>SmartChineseAnalyzer (in the analyzers/smartcn package): Index words (attempt to segment Chinese text into words) as tokens. |
||||
</ul> |
||||
|
||||
Example phrase: "我是中国人" |
||||
<ol> |
||||
<li>ChineseAnalyzer: 我-是-中-国-人</li> |
||||
<li>CJKAnalyzer: 我是-是中-中国-国人</li> |
||||
<li>SmartChineseAnalyzer: 我-是-中国-人</li> |
||||
</ol> |
||||
</p> |
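<p> |
A minimal sketch of producing the CJKAnalyzer tokens above (hypothetical field |
name; assumes the Lucene 3.6-era TokenStream consumer workflow): |
</p> |
<pre class="prettyprint"> |
  Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_36); |
  TokenStream ts = analyzer.tokenStream("content", new StringReader("我是中国人")); |
  CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); |
  ts.reset(); |
  while (ts.incrementToken()) { |
    System.out.println(term.toString());   // 我是, 是中, 中国, 国人 |
  } |
  ts.end(); |
  ts.close(); |
</pre> |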
||||
|
||||
</body> |
||||
</html> |
@ -0,0 +1,50 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cn; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.Reader; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.standard.StandardAnalyzer; // javadoc @link
|
||||
import com.fr.third.org.apache.lucene.analysis.Analyzer; |
||||
import com.fr.third.org.apache.lucene.analysis.Tokenizer; |
||||
|
||||
/** |
||||
* An {@link Analyzer} that tokenizes text with {@link ChineseTokenizer} and |
||||
* filters with {@link ChineseFilter} |
||||
* @deprecated (3.1) Use {@link StandardAnalyzer} instead, which has the same functionality. |
||||
* This analyzer will be removed in Lucene 5.0 |
||||
*/ |
||||
@Deprecated |
||||
public final class ChineseAnalyzer extends Analyzer { |
||||
|
||||
/** |
||||
* Creates |
||||
* {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* used to tokenize all the text in the provided {@link Reader}. |
||||
* |
||||
* @return {@link com.fr.third.org.apache.lucene.analysis.Analyzer.TokenStreamComponents} |
||||
* built from a {@link ChineseTokenizer} filtered with |
||||
* {@link ChineseFilter} |
||||
*/ |
||||
@Override |
||||
protected TokenStreamComponents createComponents(String fieldName, |
||||
Reader reader) { |
||||
final Tokenizer source = new ChineseTokenizer(reader); |
||||
return new TokenStreamComponents(source, new ChineseFilter(source)); |
||||
} |
||||
} |
@ -0,0 +1,104 @@
|
||||
package com.fr.third.org.apache.lucene.analysis.cn; |
||||
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
import java.io.IOException; |
||||
import java.util.Arrays; |
||||
|
||||
import com.fr.third.org.apache.lucene.analysis.TokenFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.TokenStream; |
||||
import com.fr.third.org.apache.lucene.analysis.core.StopFilter; |
||||
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
||||
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet; |
||||
import com.fr.third.org.apache.lucene.util.Version; |
||||
|
||||
/** |
||||
* A {@link TokenFilter} with a stop word table. |
||||
* <ul> |
||||
* <li>Numeric tokens are removed. |
||||
* <li>English tokens must be longer than one character. |
||||
* <li>One Chinese character as one Chinese word. |
||||
* </ul> |
||||
* TO DO: |
||||
* <ol> |
||||
* <li>Add Chinese stop words, such as \ue400 |
||||
* <li>Dictionary based Chinese word extraction |
||||
* <li>Intelligent Chinese word extraction |
||||
* </ol> |
||||
* |
||||
* @deprecated (3.1) Use {@link StopFilter} instead, which has the same functionality. |
||||
* This filter will be removed in Lucene 5.0 |
||||
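* <p> |
* A minimal sketch (hypothetical chain; by the rules above, a 1-letter English |
* token such as "a" and a numeric token such as "123" are dropped, while |
* "linux" and a lone Chinese character both survive): |
* <pre class="prettyprint"> |
*   TokenStream ts = new ChineseFilter(new ChineseTokenizer(reader)); |
* </pre> |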
*/ |
||||
@Deprecated |
||||
public final class ChineseFilter extends TokenFilter { |
||||
|
||||
|
||||
// Only English now, Chinese to be added later.
|
||||
public static final String[] STOP_WORDS = { |
||||
"and", "are", "as", "at", "be", "but", "by", |
||||
"for", "if", "in", "into", "is", "it", |
||||
"no", "not", "of", "on", "or", "such", |
||||
"that", "the", "their", "then", "there", "these", |
||||
"they", "this", "to", "was", "will", "with" |
||||
}; |
||||
|
||||
|
||||
private final CharArraySet stopTable; |
||||
|
||||
private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
||||
|
||||
public ChineseFilter(TokenStream in) { |
||||
super(in); |
||||
|
||||
stopTable = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS), false); |
||||
} |
||||
|
||||
@Override |
||||
public boolean incrementToken() throws IOException { |
||||
|
||||
while (input.incrementToken()) { |
||||
char text[] = termAtt.buffer(); |
||||
int termLength = termAtt.length(); |
||||
|
||||
// why not key off token type here assuming ChineseTokenizer comes first?
|
||||
if (!stopTable.contains(text, 0, termLength)) { |
||||
switch (Character.getType(text[0])) { |
||||
|
||||
case Character.LOWERCASE_LETTER: |
||||
case Character.UPPERCASE_LETTER: |
||||
|
||||
// English words/tokens must be longer than 1 character.
|
||||
if (termLength>1) { |
||||
return true; |
||||
} |
||||
break; |
||||
case Character.OTHER_LETTER: |
||||
|
||||
// One Chinese character as one Chinese word.
|
||||
// Chinese word extraction to be added later here.
|
||||
|
||||
return true; |
||||
} |
||||
|
||||
} |
||||
|
||||
} |
||||
return false; |
||||
} |
||||
|
||||
} |
@ -0,0 +1,36 @@
package com.fr.third.org.apache.lucene.analysis.cn;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.core.StopFilterFactory; // javadocs
import com.fr.third.org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for {@link ChineseFilter}.
 * @deprecated Use {@link StopFilterFactory} instead.
 */
@Deprecated
public class ChineseFilterFactory extends TokenFilterFactory {

  @Override
  public ChineseFilter create(TokenStream in) {
    return new ChineseFilter(in);
  }
}
@ -0,0 +1,169 @@
package com.fr.third.org.apache.lucene.analysis.cn;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizer;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import com.fr.third.org.apache.lucene.util.AttributeSource;
import com.fr.third.org.apache.lucene.util.AttributeSource.AttributeFactory; // required by the AttributeFactory constructor below

/**
 * Tokenize Chinese text as individual Chinese characters.
 *
 * <p>
 * ChineseTokenizer and CJKTokenizer differ in their token parsing logic.
 * </p>
 * <p>
 * For example, if the Chinese text "C1C2C3C4" is to be indexed:
 * <ul>
 * <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4.
 * <li>The tokens returned from CJKTokenizer are C1C2, C2C3, C3C4.
 * </ul>
 * </p>
 * <p>
 * Therefore the index created by CJKTokenizer is much larger.
 * </p>
 * <p>
 * The trade-off is that searches for C1, C1C2, C1C3, C4C2, C1C2C3 ... all
 * work with ChineseTokenizer, but not with CJKTokenizer.
 * </p>
 * @deprecated (3.1) Use {@link StandardTokenizer} instead, which has the same functionality.
 * This tokenizer will be removed in Lucene 5.0
 */
@Deprecated
public final class ChineseTokenizer extends Tokenizer {

  public ChineseTokenizer(Reader in) {
    super(in);
  }

  public ChineseTokenizer(AttributeSource source, Reader in) {
    super(source, in);
  }

  public ChineseTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
  }

  private int offset = 0, bufferIndex = 0, dataLen = 0;
  private final static int MAX_WORD_LEN = 255;
  private final static int IO_BUFFER_SIZE = 1024;
  private final char[] buffer = new char[MAX_WORD_LEN];
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  private int length;
  private int start;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private final void push(char c) {
    if (length == 0) start = offset - 1;         // start of token
    buffer[length++] = Character.toLowerCase(c); // buffer it
  }

  private final boolean flush() {
    if (length > 0) {
      termAtt.copyBuffer(buffer, 0, length);
      offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
      return true;
    } else {
      return false;
    }
  }

  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();

    length = 0;
    start = offset;

    while (true) {
      final char c;
      offset++;

      if (bufferIndex >= dataLen) {
        dataLen = input.read(ioBuffer);
        bufferIndex = 0;
      }

      if (dataLen == -1) {
        offset--;
        return flush();
      } else {
        c = ioBuffer[bufferIndex++];
      }

      switch (Character.getType(c)) {

        case Character.DECIMAL_DIGIT_NUMBER:
        case Character.LOWERCASE_LETTER:
        case Character.UPPERCASE_LETTER:
          push(c);
          if (length == MAX_WORD_LEN) return flush();
          break;

        case Character.OTHER_LETTER:
          if (length > 0) {
            // A buffered Latin/digit run ends here; put the CJK character back.
            bufferIndex--;
            offset--;
            return flush();
          }
          push(c);
          return flush();

        default:
          if (length > 0) return flush();
          break;
      }
    }
  }

  @Override
  public final void end() {
    // set final offset
    final int finalOffset = correctOffset(offset);
    this.offsetAtt.setOffset(finalOffset, finalOffset);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    offset = bufferIndex = dataLen = 0;
  }
}
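
The loop above buffers consecutive Latin letters and digits into a single token but flushes each CJK character (Character.OTHER_LETTER) as its own token. A short sketch illustrating the resulting tokens and offsets; the demo class and input are illustrative assumptions:

import java.io.IOException;
import java.io.StringReader;

import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.cn.ChineseTokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class ChineseTokenizerDemo {
  public static void main(String[] args) throws IOException {
    Tokenizer tokenizer = new ChineseTokenizer(new StringReader("abc中文123"));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // Expected (term[start,end)): abc[0,3)  中[3,4)  文[4,5)  123[5,8)
      System.out.println(term + "[" + offset.startOffset() + "," + offset.endOffset() + ")");
    }
    tokenizer.end();
    tokenizer.close();
  }
}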
@ -0,0 +1,37 @@
package com.fr.third.org.apache.lucene.analysis.cn;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;

import com.fr.third.org.apache.lucene.analysis.standard.StandardTokenizerFactory; // javadocs
import com.fr.third.org.apache.lucene.analysis.util.TokenizerFactory;

/**
 * Factory for {@link ChineseTokenizer}.
 * @deprecated Use {@link StandardTokenizerFactory} instead.
 */
@Deprecated
public class ChineseTokenizerFactory extends TokenizerFactory {

  @Override
  public ChineseTokenizer create(Reader in) {
    return new ChineseTokenizer(in);
  }
}
@ -0,0 +1,41 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
</head>
<body>
Analyzer for Chinese, which indexes unigrams (individual Chinese characters).
<p>
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
<ul>
  <li>StandardAnalyzer: Indexes unigrams (individual Chinese characters) as tokens.
  <li>CJKAnalyzer (in the analyzers/cjk package): Indexes bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
  <li>SmartChineseAnalyzer (in the analyzers/smartcn package): Indexes words (attempts to segment Chinese text into words) as tokens.
</ul>

Example phrase: "我是中国人"
<ol>
  <li>StandardAnalyzer: 我-是-中-国-人</li>
  <li>CJKAnalyzer: 我是-是中-中国-国人</li>
  <li>SmartChineseAnalyzer: 我-是-中国-人</li>
</ol>
</p>
</body>
</html>
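
The unigram case above can be reproduced with StandardAnalyzer, which the standard package elsewhere in this commit provides; CJKAnalyzer and SmartChineseAnalyzer live in separate modules and may not be bundled here, so only the first case is sketched. The demo class, field name, and input are illustrative assumptions:

import java.io.IOException;
import java.io.StringReader;

import com.fr.third.org.apache.lucene.analysis.Analyzer;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.standard.StandardAnalyzer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.util.Version;

public class ChineseSegmentationDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    TokenStream ts = analyzer.tokenStream("body", new StringReader("我是中国人"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.print(term + " "); // expected unigrams: 我 是 中 国 人
    }
    ts.end();
    ts.close();
  }
}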
@ -0,0 +1,176 @@
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */

package com.fr.third.org.apache.lucene.analysis.commongrams;

import java.io.IOException;

import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

/*
 * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
 */

/**
 * Construct bigrams for frequently occurring terms while indexing. Single terms
 * are still indexed too, with bigrams overlaid. This is achieved through the
 * use of {@link PositionIncrementAttribute#setPositionIncrement(int)}. Bigrams have a type
 * of {@link #GRAM_TYPE}. Example:
 * <ul>
 * <li>input: "the quick brown fox" (with "the" as a common word)</li>
 * <li>output: |"the", "the_quick"|"quick"|"brown"|"fox"|</li>
 * <li>"the_quick" has a position increment of 0, so it is in the same position
 * as "the"; "the_quick" has a term.type() of "gram"</li>
 * </ul>
 */

/*
 * Constructors and makeCommonSet based on similar code in StopFilter
 */
public final class CommonGramsFilter extends TokenFilter {

  public static final String GRAM_TYPE = "gram";
  private static final char SEPARATOR = '_';

  private final CharArraySet commonWords;

  private final StringBuilder buffer = new StringBuilder();

  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);

  private int lastStartOffset;
  private boolean lastWasCommon;
  private State savedState;

  /**
   * Construct a token stream filtering the given input using a set of common
   * words to create bigrams. Outputs both unigrams with position increment and
   * bigrams with position increment 0 and type="gram", where one or both of
   * the words in a potential bigram are in the set of common words.
   *
   * @param matchVersion the Lucene match version
   * @param input TokenStream input in filter chain
   * @param commonWords The set of common words.
   */
  public CommonGramsFilter(Version matchVersion, TokenStream input, CharArraySet commonWords) {
    super(input);
    this.commonWords = commonWords;
  }

  /**
   * Inserts bigrams for common words into a token stream. For each input token,
   * output the token. If the token and/or the following token are in the list
   * of common words, also output a bigram with position increment 0 and
   * type="gram".
   *
   * TODO: Consider adding an option to not emit unigram stopwords
   * as in CDL XTF BigramStopFilter; CommonGramsQueryFilter would need to be
   * changed to work with this.
   *
   * TODO: Consider optimizing for the case of three common grams, i.e.
   * "man of the year" normally produces 3 bigrams: "man-of", "of-the",
   * "the-year", but with proper management of positions we could eliminate
   * the middle bigram "of-the" and save a disk seek and a whole set of
   * position lookups.
   */
  @Override
  public boolean incrementToken() throws IOException {
    // get the next piece of input
    if (savedState != null) {
      restoreState(savedState);
      savedState = null;
      saveTermBuffer();
      return true;
    } else if (!input.incrementToken()) {
      return false;
    }

    /* We build n-grams before and after stopwords.
     * When valid, the buffer always contains at least the separator.
     * If it's empty, there is nothing before this stopword.
     */
    if (lastWasCommon || (isCommon() && buffer.length() > 0)) {
      savedState = captureState();
      gramToken();
      return true;
    }

    saveTermBuffer();
    return true;
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    lastWasCommon = false;
    savedState = null;
    buffer.setLength(0);
  }

  // ================================================= Helper Methods ================================================

  /**
   * Determines if the current token is a common term.
   *
   * @return {@code true} if the current token is a common term, {@code false} otherwise
   */
  private boolean isCommon() {
    return commonWords != null && commonWords.contains(termAttribute.buffer(), 0, termAttribute.length());
  }

  /**
   * Saves this information to form the left part of a gram.
   */
  private void saveTermBuffer() {
    buffer.setLength(0);
    buffer.append(termAttribute.buffer(), 0, termAttribute.length());
    buffer.append(SEPARATOR);
    lastStartOffset = offsetAttribute.startOffset();
    lastWasCommon = isCommon();
  }

  /**
   * Constructs a compound token.
   */
  private void gramToken() {
    buffer.append(termAttribute.buffer(), 0, termAttribute.length());
    int endOffset = offsetAttribute.endOffset();

    clearAttributes();

    int length = buffer.length();
    char[] termText = termAttribute.buffer();
    if (length > termText.length) {
      termText = termAttribute.resizeBuffer(length);
    }

    buffer.getChars(0, length, termText, 0);
    termAttribute.setLength(length);
    posIncAttribute.setPositionIncrement(0);
    posLenAttribute.setPositionLength(2); // bigram
    offsetAttribute.setOffset(lastStartOffset, endOffset);
    typeAttribute.setType(GRAM_TYPE);
    buffer.setLength(0);
  }
}
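
A sketch of the filter at index time, chaining a WhitespaceTokenizer into CommonGramsFilter with "the" as the only common word. The demo class is illustrative; the expected output in the comment follows from the incrementToken/saveTermBuffer/gramToken logic above:

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import com.fr.third.org.apache.lucene.analysis.core.WhitespaceTokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

public class CommonGramsDemo {
  public static void main(String[] args) throws IOException {
    CharArraySet common = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList("the"), true);
    Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("the quick brown fox"));
    TokenStream ts = new CommonGramsFilter(Version.LUCENE_CURRENT, tok, common);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // Expected: the(+1,word) the_quick(+0,gram) quick(+1,word) brown(+1,word) fox(+1,word)
      System.out.printf("%s(+%d,%s) ", term, posInc.getPositionIncrement(), type.type());
    }
    ts.end();
    ts.close();
  }
}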
@ -0,0 +1,79 @@
package com.fr.third.org.apache.lucene.analysis.commongrams;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.core.StopAnalyzer;
import com.fr.third.org.apache.lucene.analysis.util.*;

/**
 * Constructs a {@link CommonGramsFilter}.
 * <pre class="prettyprint">
 * <fieldType name="text_cmmngrms" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *     <filter class="solr.CommonGramsFilterFactory" words="commongramsstopwords.txt" ignoreCase="false"/>
 *   </analyzer>
 * </fieldType></pre>
 */

/*
 * This is pretty close to a straight copy from StopFilterFactory.
 */
public class CommonGramsFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {

  public void inform(ResourceLoader loader) throws IOException {
    String commonWordFiles = args.get("words");
    ignoreCase = getBoolean("ignoreCase", false);

    if (commonWordFiles != null) {
      if ("snowball".equalsIgnoreCase(args.get("format"))) {
        commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
      } else {
        commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
      }
    } else {
      commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
  }

  // Force the use of a CharArraySet, as it is the most performant, although
  // this may break things if Lucene ever goes away from it. See SOLR-1095.
  private CharArraySet commonWords;
  private boolean ignoreCase;

  public boolean isIgnoreCase() {
    return ignoreCase;
  }

  public CharArraySet getCommonWords() {
    return commonWords;
  }

  @Override
  public CommonGramsFilter create(TokenStream input) {
    return new CommonGramsFilter(luceneMatchVersion, input, commonWords);
  }
}
@ -0,0 +1,126 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.fr.third.org.apache.lucene.analysis.commongrams;

import java.io.IOException;

import com.fr.third.org.apache.lucene.analysis.TokenFilter;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import static com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsFilter.GRAM_TYPE;

/**
 * Wraps a CommonGramsFilter, optimizing phrase queries by only returning single
 * words when they are not a member of a bigram.
 *
 * Example:
 * <ul>
 * <li>query input to CommonGramsFilter: "the rain in spain falls mainly"
 * <li>output of CommonGramsFilter/input to CommonGramsQueryFilter:
 * |"the", "the_rain"|"rain", "rain_in"|"in", "in_spain"|"spain"|"falls"|"mainly"
 * <li>output of CommonGramsQueryFilter: "the_rain", "rain_in", "in_spain",
 * "falls", "mainly"
 * </ul>
 */

/*
 * See: http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/TokenStream.html and
 * http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/analysis/package.html?revision=718798
 */
public final class CommonGramsQueryFilter extends TokenFilter {

  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);

  private State previous;
  private String previousType;
  private boolean exhausted;

  /**
   * Constructs a new CommonGramsQueryFilter based on the provided CommonGramsFilter.
   *
   * @param input CommonGramsFilter the QueryFilter will use
   */
  public CommonGramsQueryFilter(CommonGramsFilter input) {
    super(input);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    previous = null;
    previousType = null;
    exhausted = false;
  }

  /**
   * Output bigrams whenever possible to optimize queries. Only output unigrams
   * when they are not a member of a bigram. Example:
   * <ul>
   * <li>input: "the rain in spain falls mainly"
   * <li>output: "the_rain", "rain_in", "in_spain", "falls", "mainly"
   * </ul>
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (!exhausted && input.incrementToken()) {
      State current = captureState();

      if (previous != null && !isGramType()) {
        // The current token is a unigram; emit the buffered previous token instead.
        restoreState(previous);
        previous = current;
        previousType = typeAttribute.type();

        if (isGramType()) {
          posIncAttribute.setPositionIncrement(1);
        }
        return true;
      }

      previous = current;
    }

    exhausted = true;

    // Suppress the final buffered unigram if the last emitted token was a gram.
    if (previous == null || GRAM_TYPE.equals(previousType)) {
      return false;
    }

    restoreState(previous);
    previous = null;

    if (isGramType()) {
      posIncAttribute.setPositionIncrement(1);
    }
    return true;
  }

  // ================================================= Helper Methods ================================================

  /**
   * Convenience method to check if the current type is a gram type.
   *
   * @return {@code true} if the current type is a gram type, {@code false} otherwise
   */
  public boolean isGramType() {
    return GRAM_TYPE.equals(typeAttribute.type());
  }
}
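
A query-time sketch pairing the two filters. With "the" as a common word, the query "the fox" reduces to the single bigram "the_fox": both unigrams are members of the bigram, so they are suppressed. The demo class and inputs are illustrative assumptions:

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;

import com.fr.third.org.apache.lucene.analysis.TokenStream;
import com.fr.third.org.apache.lucene.analysis.Tokenizer;
import com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import com.fr.third.org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import com.fr.third.org.apache.lucene.analysis.core.WhitespaceTokenizer;
import com.fr.third.org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.fr.third.org.apache.lucene.analysis.util.CharArraySet;
import com.fr.third.org.apache.lucene.util.Version;

public class CommonGramsQueryDemo {
  public static void main(String[] args) throws IOException {
    CharArraySet common = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList("the"), true);
    Tokenizer tok = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("the fox"));
    TokenStream ts = new CommonGramsQueryFilter(
        new CommonGramsFilter(Version.LUCENE_CURRENT, tok, common));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.print(term + " "); // expected: the_fox
    }
    ts.end();
    ts.close();
  }
}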