From 050021912b4cfab3148ae5c1505e3d74945d29f7 Mon Sep 17 00:00:00 2001 From: Looly Date: Thu, 14 Dec 2023 19:14:18 +0800 Subject: [PATCH] fix thread safe --- .../tokenizer/engine/TokenizerEngine.java | 4 +-- .../tokenizer/engine/hanlp/HanLPEngine.java | 5 ++- .../engine/ikanalyzer/IKAnalyzerEngine.java | 33 ++++++++++++------- .../tokenizer/engine/jcseg/JcsegEngine.java | 28 ++++++++-------- .../tokenizer/engine/jieba/JiebaEngine.java | 1 + .../tokenizer/engine/mmseg/MmsegEngine.java | 21 ++++++------ .../tokenizer/engine/mynlp/MynlpEngine.java | 4 +-- .../tokenizer/engine/word/WordEngine.java | 3 +- 8 files changed, 55 insertions(+), 44 deletions(-) diff --git a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/TokenizerEngine.java b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/TokenizerEngine.java index d6e097d0b..c0a780585 100644 --- a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/TokenizerEngine.java +++ b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/TokenizerEngine.java @@ -15,10 +15,10 @@ package org.dromara.hutool.extra.tokenizer.engine; import org.dromara.hutool.extra.tokenizer.Result; /** - * 分词引擎接口定义,用户通过实现此接口完成特定分词引擎的适配 + * 分词引擎接口定义,用户通过实现此接口完成特定分词引擎的适配
+ * 由于引擎使用单例模式,因此要求实现类保证线程安全 * * @author looly - * */ public interface TokenizerEngine { diff --git a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/hanlp/HanLPEngine.java b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/hanlp/HanLPEngine.java index 7b48e2f3c..e3bdc7ad0 100644 --- a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/hanlp/HanLPEngine.java +++ b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/hanlp/HanLPEngine.java @@ -21,10 +21,10 @@ import org.dromara.hutool.extra.tokenizer.Result; /** * HanLP分词引擎实现
- * 项目地址:https://github.com/hankcs/HanLP + * 项目地址:https://github.com/hankcs/HanLP
+ * {@link Segment#seg(String)}方法线程安全 * * @author looly - * */ public class HanLPEngine implements TokenizerEngine { @@ -32,7 +32,6 @@ public class HanLPEngine implements TokenizerEngine { /** * 构造 - * */ public HanLPEngine() { this(HanLP.newSegment()); diff --git a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/ikanalyzer/IKAnalyzerEngine.java b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/ikanalyzer/IKAnalyzerEngine.java index 337ec2321..59f67892a 100644 --- a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/ikanalyzer/IKAnalyzerEngine.java +++ b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/ikanalyzer/IKAnalyzerEngine.java @@ -12,6 +12,8 @@ package org.dromara.hutool.extra.tokenizer.engine.ikanalyzer; +import org.wltea.analyzer.cfg.Configuration; +import org.wltea.analyzer.cfg.DefaultConfig; import org.wltea.analyzer.core.IKSegmenter; import org.dromara.hutool.core.text.StrUtil; @@ -20,35 +22,44 @@ import org.dromara.hutool.extra.tokenizer.Result; /** * IKAnalyzer分词引擎实现
- * 项目地址:https://github.com/yozhao/IKAnalyzer + * 项目地址:https://github.com/yozhao/IKAnalyzer
+ * {@link IKSegmenter} 非线程安全,因此每次单独创建对象 * * @author looly - * */ public class IKAnalyzerEngine implements TokenizerEngine { - private final IKSegmenter seg; + private final Configuration cfg; /** * 构造 - * */ public IKAnalyzerEngine() { - this(new IKSegmenter(null, true)); + this(createDefaultConfig()); } /** * 构造 - * - * @param seg {@link IKSegmenter} + * @param cfg 配置 */ - public IKAnalyzerEngine(final IKSegmenter seg) { - this.seg = seg; + public IKAnalyzerEngine(final Configuration cfg) { + cfg.setUseSmart(true); + this.cfg = cfg; } @Override public Result parse(final CharSequence text) { - this.seg.reset(StrUtil.getReader(text)); - return new IKAnalyzerResult(this.seg); + final IKSegmenter seg = new IKSegmenter(StrUtil.getReader(text), cfg); + return new IKAnalyzerResult(seg); + } + + /** + * 创建默认配置 + * @return {@link Configuration} + */ + private static Configuration createDefaultConfig(){ + final Configuration configuration = DefaultConfig.getInstance(); + configuration.setUseSmart(true); + return configuration; } } diff --git a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/jcseg/JcsegEngine.java b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/jcseg/JcsegEngine.java index 41a277ff8..cb67d6a05 100644 --- a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/jcseg/JcsegEngine.java +++ b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/jcseg/JcsegEngine.java @@ -26,45 +26,45 @@ import java.io.StringReader; /** * Jcseg分词引擎实现
- * 项目地址:https://gitee.com/lionsoul/jcseg + * 项目地址:https://gitee.com/lionsoul/jcseg
+ * {@link ISegment}非线程安全,每次单独创建 * * @author looly - * */ public class JcsegEngine implements TokenizerEngine { - private final ISegment segment; + private final SegmenterConfig config; + private final ADictionary dic; /** * 构造 */ public JcsegEngine() { // 创建SegmenterConfig分词配置实例,自动查找加载jcseg.properties配置项来初始化 - final SegmenterConfig config = new SegmenterConfig(true); - // 创建默认单例词库实现,并且按照config配置加载词库 - final ADictionary dic = DictionaryFactory.createSingletonDictionary(config); - - // 依据给定的ADictionary和SegmenterConfig来创建ISegment - this.segment = ISegment.COMPLEX.factory.create(config, dic); + this(new SegmenterConfig(true)); } /** * 构造 * - * @param segment {@link ISegment} + * @param config {@link SegmenterConfig} */ - public JcsegEngine(final ISegment segment) { - this.segment = segment; + public JcsegEngine(final SegmenterConfig config) { + this.config = config; + // 创建默认单例词库实现,并且按照config配置加载词库 + this.dic = DictionaryFactory.createSingletonDictionary(config); } @Override public Result parse(final CharSequence text) { + // 依据给定的ADictionary和SegmenterConfig来创建ISegment + final ISegment segment = ISegment.COMPLEX.factory.create(config, dic); try { - this.segment.reset(new StringReader(StrUtil.str(text))); + segment.reset(new StringReader(StrUtil.str(text))); } catch (final IOException e) { throw new TokenizerException(e); } - return new JcsegResult(this.segment); + return new JcsegResult(segment); } } diff --git a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/jieba/JiebaEngine.java b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/jieba/JiebaEngine.java index 0e5d9f859..b5a74cac6 100644 --- a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/jieba/JiebaEngine.java +++ b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/jieba/JiebaEngine.java @@ -22,6 +22,7 @@ import org.dromara.hutool.extra.tokenizer.Result; /** * Jieba分词引擎实现
* 项目地址:https://github.com/huaban/jieba-analysis + * {@link JiebaSegmenter#process(String, SegMode)} 线程安全 * * @author looly * diff --git a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/mmseg/MmsegEngine.java b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/mmseg/MmsegEngine.java index 047edf29f..4999d63f0 100644 --- a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/mmseg/MmsegEngine.java +++ b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/mmseg/MmsegEngine.java @@ -12,6 +12,7 @@ package org.dromara.hutool.extra.tokenizer.engine.mmseg; +import com.chenlb.mmseg4j.Seg; import org.dromara.hutool.core.text.StrUtil; import org.dromara.hutool.extra.tokenizer.Result; import org.dromara.hutool.extra.tokenizer.engine.TokenizerEngine; @@ -23,37 +24,35 @@ import java.io.StringReader; /** * mmseg4j分词引擎实现
- * 项目地址:https://github.com/chenlb/mmseg4j-core + * 项目地址:https://github.com/chenlb/mmseg4j-core
+ * {@link MMSeg}非线程安全,故单独创建之 * * @author looly - * */ public class MmsegEngine implements TokenizerEngine { - private final MMSeg mmSeg; + private final Seg seg; /** * 构造 */ public MmsegEngine() { - final Dictionary dict = Dictionary.getInstance(); - final ComplexSeg seg = new ComplexSeg(dict); - this.mmSeg = new MMSeg(new StringReader(""), seg); + this(new ComplexSeg(Dictionary.getInstance())); } /** * 构造 * - * @param mmSeg 模式{@link MMSeg} + * @param seg 模式{@link Seg} */ - public MmsegEngine(final MMSeg mmSeg) { - this.mmSeg = mmSeg; + public MmsegEngine(final Seg seg) { + this.seg = seg; } @Override public Result parse(final CharSequence text) { - this.mmSeg.reset(StrUtil.getReader(text)); - return new MmsegResult(this.mmSeg); + final MMSeg mmSeg = new MMSeg(StrUtil.getReader(text), seg); + return new MmsegResult(mmSeg); } } diff --git a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/mynlp/MynlpEngine.java b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/mynlp/MynlpEngine.java index c3a861177..7fca5d9b4 100644 --- a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/mynlp/MynlpEngine.java +++ b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/mynlp/MynlpEngine.java @@ -21,10 +21,10 @@ import org.dromara.hutool.extra.tokenizer.engine.TokenizerEngine; /** * MYNLP 中文NLP工具包分词实现
- * 项目地址:https://github.com/mayabot/mynlp/ + * 项目地址:https://github.com/mayabot/mynlp/
+ * {@link Lexer} 线程安全 * * @author looly - * */ public class MynlpEngine implements TokenizerEngine { diff --git a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/word/WordEngine.java b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/word/WordEngine.java index 497c5dde7..71722c99a 100644 --- a/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/word/WordEngine.java +++ b/hutool-extra/src/main/java/org/dromara/hutool/extra/tokenizer/engine/word/WordEngine.java @@ -22,7 +22,8 @@ import org.dromara.hutool.extra.tokenizer.engine.TokenizerEngine; /** * Word分词引擎实现
- * 项目地址:https://github.com/ysc/word + * 项目地址:https://github.com/ysc/word
+ * {@link Segmentation} 线程安全 * * @author looly *