对文本进行分词,文本进行分词,公司项目,依赖IKAnalyzer
分享于 点击 47700 次 点评:266
对文本进行分词,文本进行分词,公司项目,依赖IKAnalyzer
公司项目,依赖IKAnalyzer.jar 的org.mira.lucene.analysis.f类以及分词词典
针对业务需求写了一个TokenFilter,去掉所有含有数字、英文的token,以及只有一个汉字的token(只保留长度在2到10之间的token)
//分析器public class DydAnalyzer extends Analyzer{ public final TokenStream tokenStream(String paramString, Reader reader){ TokenStream ts = new f(reader); ts = new DydFilter(ts); return ts; }}//过滤器public class DydFilter extends TokenFilter{ private int min = 2; private int max = 10; protected DydFilter(TokenStream input) { super(input); } @SuppressWarnings("deprecation") @Override public Token next(Token reusableToken) throws IOException { assert (reusableToken != null); for (Token nextToken = this.input.next(reusableToken); nextToken != null; nextToken = this.input.next(reusableToken)){ String word = nextToken.termText(); int len = nextToken.termLength(); if (checkLength(len) && !checkNum(word) && !checkEN(word)) { return nextToken; } } return null; } /** * 判断长度 * @param len * @return */ private boolean checkLength (int len) { if ((len >= this.min) && (len <= this.max)) return true; return false; } /** * 去掉数字 * @param word * @return */ private boolean checkNum (String word) { if (RegexUtils.matchNum(word)) return true; return false; } /** * 去掉英文 * @param word * @return */ private boolean checkEN (String word ) { if (RegexUtils.matchEN(word)) return true; return false; }}//该片段来自于http://byrx.net
用户点评