fix thread safe

This commit is contained in:
Looly 2023-12-14 19:14:18 +08:00
parent 9a5fd52e9f
commit 050021912b
8 changed files with 55 additions and 44 deletions

View File

@ -15,10 +15,10 @@ package org.dromara.hutool.extra.tokenizer.engine;
import org.dromara.hutool.extra.tokenizer.Result; import org.dromara.hutool.extra.tokenizer.Result;
/** /**
* 分词引擎接口定义用户通过实现此接口完成特定分词引擎的适配 * 分词引擎接口定义用户通过实现此接口完成特定分词引擎的适配<br>
* 由于引擎使用单例模式因此要求实现类保证线程安全
* *
* @author looly * @author looly
*
*/ */
public interface TokenizerEngine { public interface TokenizerEngine {

View File

@ -21,10 +21,10 @@ import org.dromara.hutool.extra.tokenizer.Result;
/** /**
* HanLP分词引擎实现<br> * HanLP分词引擎实现<br>
* 项目地址https://github.com/hankcs/HanLP * 项目地址https://github.com/hankcs/HanLP<br>
* {@link Segment#seg(String)}方法线程安全
* *
* @author looly * @author looly
*
*/ */
public class HanLPEngine implements TokenizerEngine { public class HanLPEngine implements TokenizerEngine {
@ -32,7 +32,6 @@ public class HanLPEngine implements TokenizerEngine {
/** /**
* 构造 * 构造
*
*/ */
public HanLPEngine() { public HanLPEngine() {
this(HanLP.newSegment()); this(HanLP.newSegment());

View File

@ -12,6 +12,8 @@
package org.dromara.hutool.extra.tokenizer.engine.ikanalyzer; package org.dromara.hutool.extra.tokenizer.engine.ikanalyzer;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.core.IKSegmenter; import org.wltea.analyzer.core.IKSegmenter;
import org.dromara.hutool.core.text.StrUtil; import org.dromara.hutool.core.text.StrUtil;
@ -20,35 +22,44 @@ import org.dromara.hutool.extra.tokenizer.Result;
/** /**
* IKAnalyzer分词引擎实现<br> * IKAnalyzer分词引擎实现<br>
* 项目地址https://github.com/yozhao/IKAnalyzer * 项目地址https://github.com/yozhao/IKAnalyzer<br>
* {@link IKSegmenter} 非线程安全因此每次单独创建对象
* *
* @author looly * @author looly
*
*/ */
public class IKAnalyzerEngine implements TokenizerEngine { public class IKAnalyzerEngine implements TokenizerEngine {
private final IKSegmenter seg; private final Configuration cfg;
/** /**
* 构造 * 构造
*
*/ */
public IKAnalyzerEngine() { public IKAnalyzerEngine() {
this(new IKSegmenter(null, true)); this(createDefaultConfig());
} }
/** /**
* 构造 * 构造
* * @param cfg 配置
* @param seg {@link IKSegmenter}
*/ */
public IKAnalyzerEngine(final IKSegmenter seg) { public IKAnalyzerEngine(final Configuration cfg) {
this.seg = seg; cfg.setUseSmart(true);
this.cfg = cfg;
} }
@Override @Override
public Result parse(final CharSequence text) { public Result parse(final CharSequence text) {
this.seg.reset(StrUtil.getReader(text)); final IKSegmenter seg = new IKSegmenter(StrUtil.getReader(text), cfg);
return new IKAnalyzerResult(this.seg); return new IKAnalyzerResult(seg);
}
/**
* 创建默认配置
* @return {@link Configuration}
*/
private static Configuration createDefaultConfig(){
final Configuration configuration = DefaultConfig.getInstance();
configuration.setUseSmart(true);
return configuration;
} }
} }

View File

@ -26,45 +26,45 @@ import java.io.StringReader;
/** /**
* Jcseg分词引擎实现<br> * Jcseg分词引擎实现<br>
* 项目地址https://gitee.com/lionsoul/jcseg * 项目地址https://gitee.com/lionsoul/jcseg<br>
* {@link ISegment}非线程安全每次单独创建
* *
* @author looly * @author looly
*
*/ */
public class JcsegEngine implements TokenizerEngine { public class JcsegEngine implements TokenizerEngine {
private final ISegment segment; private final SegmenterConfig config;
private final ADictionary dic;
/** /**
* 构造 * 构造
*/ */
public JcsegEngine() { public JcsegEngine() {
// 创建SegmenterConfig分词配置实例自动查找加载jcseg.properties配置项来初始化 // 创建SegmenterConfig分词配置实例自动查找加载jcseg.properties配置项来初始化
final SegmenterConfig config = new SegmenterConfig(true); this(new SegmenterConfig(true));
// 创建默认单例词库实现并且按照config配置加载词库
final ADictionary dic = DictionaryFactory.createSingletonDictionary(config);
// 依据给定的ADictionary和SegmenterConfig来创建ISegment
this.segment = ISegment.COMPLEX.factory.create(config, dic);
} }
/** /**
* 构造 * 构造
* *
* @param segment {@link ISegment} * @param config {@link SegmenterConfig}
*/ */
public JcsegEngine(final ISegment segment) { public JcsegEngine(final SegmenterConfig config) {
this.segment = segment; this.config = config;
// 创建默认单例词库实现并且按照config配置加载词库
this.dic = DictionaryFactory.createSingletonDictionary(config);
} }
@Override @Override
public Result parse(final CharSequence text) { public Result parse(final CharSequence text) {
// 依据给定的ADictionary和SegmenterConfig来创建ISegment
final ISegment segment = ISegment.COMPLEX.factory.create(config, dic);
try { try {
this.segment.reset(new StringReader(StrUtil.str(text))); segment.reset(new StringReader(StrUtil.str(text)));
} catch (final IOException e) { } catch (final IOException e) {
throw new TokenizerException(e); throw new TokenizerException(e);
} }
return new JcsegResult(this.segment); return new JcsegResult(segment);
} }
} }

View File

@ -22,6 +22,7 @@ import org.dromara.hutool.extra.tokenizer.Result;
/** /**
* Jieba分词引擎实现<br> * Jieba分词引擎实现<br>
* 项目地址https://github.com/huaban/jieba-analysis * 项目地址https://github.com/huaban/jieba-analysis<br>
* {@link JiebaSegmenter#process(String, SegMode)} 线程安全
* *
* @author looly * @author looly
* *

View File

@ -12,6 +12,7 @@
package org.dromara.hutool.extra.tokenizer.engine.mmseg; package org.dromara.hutool.extra.tokenizer.engine.mmseg;
import com.chenlb.mmseg4j.Seg;
import org.dromara.hutool.core.text.StrUtil; import org.dromara.hutool.core.text.StrUtil;
import org.dromara.hutool.extra.tokenizer.Result; import org.dromara.hutool.extra.tokenizer.Result;
import org.dromara.hutool.extra.tokenizer.engine.TokenizerEngine; import org.dromara.hutool.extra.tokenizer.engine.TokenizerEngine;
@ -23,37 +24,35 @@ import java.io.StringReader;
/** /**
* mmseg4j分词引擎实现<br> * mmseg4j分词引擎实现<br>
* 项目地址https://github.com/chenlb/mmseg4j-core * 项目地址https://github.com/chenlb/mmseg4j-core<br>
* {@link MMSeg}非线程安全故单独创建之
* *
* @author looly * @author looly
*
*/ */
public class MmsegEngine implements TokenizerEngine { public class MmsegEngine implements TokenizerEngine {
private final MMSeg mmSeg; private final Seg seg;
/** /**
* 构造 * 构造
*/ */
public MmsegEngine() { public MmsegEngine() {
final Dictionary dict = Dictionary.getInstance(); this(new ComplexSeg(Dictionary.getInstance()));
final ComplexSeg seg = new ComplexSeg(dict);
this.mmSeg = new MMSeg(new StringReader(""), seg);
} }
/** /**
* 构造 * 构造
* *
* @param mmSeg 模式{@link MMSeg} * @param seg 模式{@link Seg}
*/ */
public MmsegEngine(final MMSeg mmSeg) { public MmsegEngine(final Seg seg) {
this.mmSeg = mmSeg; this.seg = seg;
} }
@Override @Override
public Result parse(final CharSequence text) { public Result parse(final CharSequence text) {
this.mmSeg.reset(StrUtil.getReader(text)); final MMSeg mmSeg = new MMSeg(StrUtil.getReader(text), seg);
return new MmsegResult(this.mmSeg); return new MmsegResult(mmSeg);
} }
} }

View File

@ -21,10 +21,10 @@ import org.dromara.hutool.extra.tokenizer.engine.TokenizerEngine;
/** /**
* MYNLP 中文NLP工具包分词实现<br> * MYNLP 中文NLP工具包分词实现<br>
* 项目地址https://github.com/mayabot/mynlp/ * 项目地址https://github.com/mayabot/mynlp/<br>
* {@link Lexer} 线程安全
* *
* @author looly * @author looly
*
*/ */
public class MynlpEngine implements TokenizerEngine { public class MynlpEngine implements TokenizerEngine {

View File

@ -22,7 +22,8 @@ import org.dromara.hutool.extra.tokenizer.engine.TokenizerEngine;
/** /**
* Word分词引擎实现<br> * Word分词引擎实现<br>
* 项目地址https://github.com/ysc/word * 项目地址https://github.com/ysc/word<br>
* {@link Segmentation} 线程安全
* *
* @author looly * @author looly
* *