From bea37293ad6abe468c40a28d029f56c14177283c Mon Sep 17 00:00:00 2001 From: Looly Date: Tue, 3 Sep 2019 19:18:39 +0800 Subject: [PATCH] add mynlp --- CHANGELOG.md | 1 + hutool-extra/pom.xml | 6 +++ .../tokenizer/engine/TokenizerFactory.java | 6 +++ .../tokenizer/engine/mynlp/MynlpEngine.java | 44 ++++++++++++++++ .../tokenizer/engine/mynlp/MynlpResult.java | 50 +++++++++++++++++++ .../tokenizer/engine/mynlp/MynlpWord.java | 45 +++++++++++++++++ .../tokenizer/engine/mynlp/package-info.java | 8 +++ .../extra/tokenizer/TokenizerUtilTest.java | 12 +++++ pom.xml | 1 + 9 files changed, 173 insertions(+) create mode 100644 hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpEngine.java create mode 100644 hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpResult.java create mode 100644 hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpWord.java create mode 100644 hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/package-info.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 916243740..2c6215634 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ * 【extra】 Sftp得put方法增加进度支持(issue#518@Github) * 【core】 ArrayUtil增加distinct方法 * 【http】 去除log模块依赖,Cookie中去除日志提示,body方法传入JSON对象废弃,未来移除json模块依赖 +* 【extra】 添加MyNLP支持(issue#519@Github) ### Bug修复 diff --git a/hutool-extra/pom.xml b/hutool-extra/pom.xml index 038948df5..c885011bd 100644 --- a/hutool-extra/pom.xml +++ b/hutool-extra/pom.xml @@ -200,5 +200,11 @@ 1.2 true + + com.mayabot.mynlp + mynlp-segment + 3.0.0 + true + diff --git a/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/TokenizerFactory.java b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/TokenizerFactory.java index 5309ee384..9e0ef1a6b 100644 --- a/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/TokenizerFactory.java +++ b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/TokenizerFactory.java @@ -10,6 +10,7 @@ import 
cn.hutool.extra.tokenizer.engine.ikanalyzer.IKAnalyzerEngine; import cn.hutool.extra.tokenizer.engine.jcseg.JcsegEngine; import cn.hutool.extra.tokenizer.engine.jieba.JiebaEngine; import cn.hutool.extra.tokenizer.engine.mmseg.MmsegEngine; +import cn.hutool.extra.tokenizer.engine.mynlp.MynlpEngine; import cn.hutool.extra.tokenizer.engine.word.WordEngine; import cn.hutool.log.StaticLog; @@ -77,6 +78,11 @@ public class TokenizerFactory { } catch (NoClassDefFoundError e) { // ignore } + try { + return new MynlpEngine(); + } catch (NoClassDefFoundError e) { + // ignore + } throw new TokenizerException("No tokenizer found ! Please add some tokenizer jar to your project !"); } } diff --git a/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpEngine.java b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpEngine.java new file mode 100644 index 000000000..640a7defc --- /dev/null +++ b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpEngine.java @@ -0,0 +1,44 @@ +package cn.hutool.extra.tokenizer.engine.mynlp; + +import com.mayabot.nlp.segment.Lexer; +import com.mayabot.nlp.segment.Lexers; +import com.mayabot.nlp.segment.Sentence; + +import cn.hutool.core.util.StrUtil; +import cn.hutool.extra.tokenizer.Result; +import cn.hutool.extra.tokenizer.TokenizerEngine; + +/** + * MYNLP 中文NLP工具包分词实现
+ * 项目地址:https://github.com/mayabot/mynlp/ + * + * @author looly + * + */ +public class MynlpEngine implements TokenizerEngine { + + private Lexer lexer; + + /** + * 构造 + */ + public MynlpEngine() { + this.lexer = Lexers.core(); + } + + /** + * 构造 + * + * @param lexer 分词器接口{@link Lexer} + */ + public MynlpEngine(Lexer lexer) { + this.lexer = lexer; + } + + @Override + public Result parse(CharSequence text) { + final Sentence sentence = this.lexer.scan(StrUtil.str(text)); + return new MynlpResult(sentence); + } + +} diff --git a/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpResult.java b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpResult.java new file mode 100644 index 000000000..5fc0f73ea --- /dev/null +++ b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpResult.java @@ -0,0 +1,50 @@ +package cn.hutool.extra.tokenizer.engine.mynlp; + +import java.util.Iterator; + +import com.mayabot.nlp.segment.Sentence; +import com.mayabot.nlp.segment.WordTerm; + +import cn.hutool.extra.tokenizer.Result; +import cn.hutool.extra.tokenizer.Word; + +/** + * MYNLP 中文NLP工具包分词结果实现
+ * 项目地址:https://github.com/mayabot/mynlp/ + * + * @author looly + * + */ +public class MynlpResult implements Result { + + private Iterator result; + + /** + * 构造 + * + * @param sentence 分词结果(中文句子) + */ + public MynlpResult(Sentence sentence) { + this.result = sentence.iterator(); + } + + @Override + public boolean hasNext() { + return result.hasNext(); + } + + @Override + public Word next() { + return new MynlpWord(result.next()); + } + + @Override + public void remove() { + result.remove(); + } + + @Override + public Iterator iterator() { + return this; + } +} diff --git a/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpWord.java b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpWord.java new file mode 100644 index 000000000..a273c75dd --- /dev/null +++ b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/MynlpWord.java @@ -0,0 +1,45 @@ +package cn.hutool.extra.tokenizer.engine.mynlp; + +import com.mayabot.nlp.segment.WordTerm; + +import cn.hutool.extra.tokenizer.Word; + +/** + * MYNLP分词中的一个单词包装 + * + * @author looly + * + */ +public class MynlpWord implements Word { + + private WordTerm word; + + /** + * 构造 + * + * @param word {@link WordTerm} + */ + public MynlpWord(WordTerm word) { + this.word = word; + } + + @Override + public String getText() { + return word.getWord(); + } + + @Override + public int getStartOffset() { + return this.word.offset; + } + + @Override + public int getEndOffset() { + return getStartOffset() + word.word.length(); + } + + @Override + public String toString() { + return getText(); + } +} diff --git a/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/package-info.java b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/package-info.java new file mode 100644 index 000000000..9d528f00a --- /dev/null +++ b/hutool-extra/src/main/java/cn/hutool/extra/tokenizer/engine/mynlp/package-info.java @@ -0,0 +1,8 @@ +/** + * MYNLP 中文NLP工具包分词实现
+ * 项目地址:https://github.com/mayabot/mynlp/ + * + * @author Looly + * @since 4.6.5 + */ +package cn.hutool.extra.tokenizer.engine.mynlp; \ No newline at end of file diff --git a/hutool-extra/src/test/java/cn/hutool/extra/tokenizer/TokenizerUtilTest.java b/hutool-extra/src/test/java/cn/hutool/extra/tokenizer/TokenizerUtilTest.java index cc752035a..3b061cab5 100644 --- a/hutool-extra/src/test/java/cn/hutool/extra/tokenizer/TokenizerUtilTest.java +++ b/hutool-extra/src/test/java/cn/hutool/extra/tokenizer/TokenizerUtilTest.java @@ -3,6 +3,7 @@ package cn.hutool.extra.tokenizer; import java.util.Iterator; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; import cn.hutool.core.collection.CollUtil; @@ -12,6 +13,7 @@ import cn.hutool.extra.tokenizer.engine.ikanalyzer.IKAnalyzerEngine; import cn.hutool.extra.tokenizer.engine.jcseg.JcsegEngine; import cn.hutool.extra.tokenizer.engine.jieba.JiebaEngine; import cn.hutool.extra.tokenizer.engine.mmseg.MmsegEngine; +import cn.hutool.extra.tokenizer.engine.mynlp.MynlpEngine; import cn.hutool.extra.tokenizer.engine.word.WordEngine; /** @@ -86,6 +88,16 @@ public class TokenizerUtilTest { Assert.assertEquals("这两个 方法 的 区别 在于 返回值", resultStr); } + @Test + @Ignore + public void mynlpTest() { + // 此单元测试需要JDK8,默认忽略 + TokenizerEngine engine = new MynlpEngine(); + Result result = engine.parse(text); + String resultStr = CollUtil.join((Iterator)result, " "); + Assert.assertEquals("这 两个 方法 的 区别 在于 返回 值", resultStr); + } + private void checkResult(Result result) { String resultStr = CollUtil.join((Iterator)result, " "); Assert.assertEquals("这 两个 方法 的 区别 在于 返回 值", resultStr); diff --git a/pom.xml b/pom.xml index a93108a31..efa60d0b1 100644 --- a/pom.xml +++ b/pom.xml @@ -89,6 +89,7 @@ ${compile.version} ${compile.version} + true