介绍
相关项目
- HanLP:https://github.com/hankcs/HanLP
- 示例代码 Java:https://github.com/hankcs/HanLP/tree/v1.7.5/src/test/java/com/hankcs/book
- 示例代码 Python:https://github.com/hankcs/pyhanlp/tree/master/tests/book
- 模型文件下载,
- 数据文件
Java
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.7.5</version>
</dependency>
一些工具
1、用 analyzer 中获取模型核心
PerceptronLexicalAnalyzer analyzer = new PerceptronLexicalAnalyzer(MODEL_PATH);
LinearModel segModel = analyzer.getPerceptronSegmenter().getModel();
LinearModel posModel = analyzer.getPerceptronPOSTagger().getModel();
LinearModel nerModel = analyzer.getPerceptionNERecognizer().getModel();
2、模型核心压缩和保存
// 关于模型压缩
LinearModel model = new LinearModel("src/model/cws.bin") // 读入模型
model.save("src/model/compress_cws.bin", 0.5, false)
// 参数分别是:新模型路径,压缩比例,是否同时保存一个 txt 供 debug
// 压缩到 0.5 其实准召下降不大
// 如果再次压缩,模型文件又会减半
3、 读取模型核心,并构建 analyzer
PerceptronLexicalAnalyzer analyzer_new = new PerceptronLexicalAnalyzer(
new PerceptronSegmenter("src/main/resources/model/cws.bin"),
new PerceptronPOSTagger("src/main/resources/model/pku199801_pos.bin"),
new PerceptronNERecognizer("src/main/resources/model/pku_198801_ner.bin")
);
import
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.hankcs.hanlp.model.perceptron.CWSTrainer;
import com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer;
import com.hankcs.hanlp.model.perceptron.model.LinearModel;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.CWSEvaluator;
import org.junit.Test;
import java.io.IOException;
step1:分词模型
static {
// 此 demo 不涉及词性
HanLP.Config.ShowTermNature = true;
}
public static String TRAIN_PATH = "src/data/test/icwb2-data/training/msr_training.utf8";
public static String TEST_PATH = "src/data/test/icwb2-data/testing/msr_test.utf8";
public static String GOLD_PATH = "src/data/test/icwb2-data/gold/msr_test_gold.utf8";
public static String SEG_MODEL_PATH = "src/data/test/msr_cws";
public static String OUTPUT_PATH = "src/data/test/msr_output.txt";
public static String TRAIN_WORDS = "src/data/test/icwb2-data/gold/msr_training_words.utf8";
// 模型训练和保存
@Test
public void tstSeg1() throws IOException {
CWSTrainer cwsTrainer = new CWSTrainer();
LinearModel segModel = cwsTrainer.train(TRAIN_PATH, SEG_MODEL_PATH).getModel(); // 训练模型
}
// 模型的读取、评估和使用
@Test
public void tstSeg2() throws IOException {
// 模型读取
LinearModel segModel = new LinearModel(SEG_MODEL_PATH);
Segment segment = new PerceptronLexicalAnalyzer(segModel).enableCustomDictionary(false);
// 模型评估
System.out.println(CWSEvaluator.evaluate(segment, TEST_PATH, OUTPUT_PATH, GOLD_PATH, TRAIN_WORDS)); // 标准化评测
// 模型使用
System.out.println(segment.seg("商品和服务"));
}
// 用规则调整分词
@Test
public void tstSeg3() throws IOException {
PerceptronLexicalAnalyzer segment = new PerceptronLexicalAnalyzer(SEG_MODEL_PATH);
segment.enableCustomDictionary(false);
System.out.println(segment.seg("与川普通电话"));
// > [与, 川, 普通, 电话]
// 加入自定义词典
CustomDictionary.insert("川普", "nrf 1");
segment.enableCustomDictionaryForcing(true);
System.out.println(segment.seg("与川普通电话"));
// 看似结果是好的: [与, 川普, 通电话]
System.out.println(segment.seg("银川普通人与川普通电话讲四川普通话"));
// 但是出了新问题:[银, 川普, 通人, 与, 川普, 通电话, 讲, 四, 川普, 通话]
}
// 在线学习
@Test
public void tstSeg4() throws IOException {
PerceptronLexicalAnalyzer segment = new PerceptronLexicalAnalyzer(SEG_MODEL_PATH);
segment.enableCustomDictionary(false);
System.out.println(segment.seg("与川普通电话"));
// > [与, 川, 普通, 电话]
for (int i = 0; i < 3; ++i) {
segment.learn("人 与 川普 通电话");
}
System.out.println(segment.seg("银川普通人与川普通电话讲四川普通话"));
// 结果完美! [银川, 普通人, 与, 川普, 通电话, 讲, 四川, 普通话]
}
step2:词性标注
模型训练和保存
public static String PKU199801_TRAIN = "src/data/test/pku98/199801-train.txt"; // 训练集
public static String PKU199801_TEST = "src/data/test/pku98/199801-test.txt"; // 测试集
public static String pos_model = "src/main/resources/model/pku199801_pos.bin"; // posTagger 模型保存位置
public static String NER_MODEL = "src/main/resources/model/pku199801_ner.bin"; // ner 模型保存位置
public static String perceptronModelPath = "src/data/model/perceptronModel_pos.bin";
// 训练并保存模型
@Test
public void tstPosTagger1() throws IOException {
PerceptronTrainer posTrainer = new POSTrainer();
//同态的train方法可以指定压缩率
LinearModel posModel = posTrainer.train(PKU199801_TRAIN, pos_model).getModel();
}
// 读取、评估模型和使用模型
@Test
public void tstPosTagger2() throws IOException {
LinearModel posModel = new LinearModel(pos_model);
PerceptronPOSTagger postTagger = new PerceptronPOSTagger(posModel);
//评估模型
System.out.printf("感知机 ACC = %.2f%%\n",PosTagUtil.evaluate(postTagger,PKU199801_TEST));
//使用模型做标注
System.out.println(Arrays.toString(postTagger.tag("多吃", "苹果", "有益", "健康")));
}
// 用规则调整pos
@Test
public void tstPosTagger3() throws IOException {
PerceptronLexicalAnalyzer perceptronLexicalAnalyzer = new PerceptronLexicalAnalyzer(
new PerceptronSegmenter("src/main/resources/model/cws.bin"),
new PerceptronPOSTagger("src/main/resources/model/pku199801_pos.bin"),
new PerceptronNERecognizer("src/main/resources/model/ner.bin")
);
CustomDictionary.insert("希望小学", "学校 1");
perceptronLexicalAnalyzer.enableCustomDictionaryForcing(true);
//仅标注
System.out.println(Arrays.toString(perceptronLexicalAnalyzer.tag("多吃", "苹果", "有益", "健康")));
//分词+标注
System.out.println(perceptronLexicalAnalyzer.analyze("他的希望是希望上希望小学"));
// 他/r 的/u 希望/n 是/v 希望/v 上/v 希望小学/学校
}
// 使用在线训练来调整模型。原本苹果为 n(名词),希望标注为nr(专有名词)
@Test
public void tstPosTagger4() throws IOException {
PerceptronLexicalAnalyzer perceptronLexicalAnalyzer = new PerceptronLexicalAnalyzer(
new PerceptronSegmenter("src/main/resources/model/cws.bin"),
new PerceptronPOSTagger("src/main/resources/model/pku199801_pos.bin"),
new PerceptronNERecognizer("src/main/resources/model/ner.bin")
);
//训练前
System.out.println(perceptronLexicalAnalyzer.analyze("他想买苹果手机"));
//他/r 想/v 买/v 苹果/n 手机/n
for (int i = 0; i < 100; i++) {
perceptronLexicalAnalyzer.learn("他/r 的/u 希望/n 是/v 买/v 苹果/nr 手机/n");
}
System.out.println(perceptronLexicalAnalyzer.analyze("他想买苹果手机"));
//他/r 想/v 买/v 苹果/nr 手机/n
}
词性标准有个映射关系,还挺坑的 看
step3:NER
// 训练并保存模型
@Test
public void tstNER1() throws IOException {
NERTrainer nerTrainer = new NERTrainer();
LinearModel nerModel = nerTrainer.train("src/data/test/pku98/199801.txt",
"src/main/resources/model/pku_198801_ner.bin").getModel();
}
// 读取、使用模型
@Test
public void tstNER2() throws IOException {
LinearModel nerModel = new LinearModel("src/main/resources/model/pku_198801_ner.bin");
PerceptronNERecognizer perceptronNERecognizer = new PerceptronNERecognizer(nerModel);
String[] wordArray = {"华北", "电力", "公司"}; // 构造单词序列
String[] posArray = {"ns", "n", "n"}; // 构造词性序列
String[] nerTagArray = perceptronNERecognizer.recognize(wordArray, posArray); // 序列标注
for (int i = 0; i < wordArray.length; i++)
System.out.printf("%s\t%s\t%s\t\n", wordArray[i], posArray[i], nerTagArray[i]);
//华北 ns B-nt
//电力 n M-nt
//公司 n E-nt
}
// 在线学习
@Test
public void tstNER3() throws IOException {
PerceptronLexicalAnalyzer perceptronLexicalAnalyzer = new PerceptronLexicalAnalyzer(
new PerceptronSegmenter("src/main/resources/model/cws.bin"),
new PerceptronPOSTagger("src/main/resources/model/pku199801_pos.bin"),
new PerceptronNERecognizer("src/main/resources/model/pku_198801_ner.bin")
);
//使用模型
perceptronLexicalAnalyzer.enableCustomDictionary(false);
System.out.println(perceptronLexicalAnalyzer.analyze("华北电力公司董事长谭旭光和秘书胡花蕊来到美国纽约现代艺术博物馆参观"));
//[华北/ns 电力/n 公司/n]/nt 董事长/n 谭旭光/nr 和/c 秘书/n 胡花蕊/nr 来到/v [美国/ns 纽约/ns 现代/t 艺术/n 博物馆/n]/ns 参观/v
//在线学习
System.out.println(perceptronLexicalAnalyzer.analyze("与特朗普通电话讨论太空探索技术公司"));
Sentence sentence = Sentence.create("与/c 特朗普/nr 通/v 电话/n 讨论/v [太空/s 探索/vn 技术/n 公司/n]/nt");
for (int i = 0; i < 20; i++) {
if (perceptronLexicalAnalyzer.analyze(sentence.text()).equals(sentence)) break;
perceptronLexicalAnalyzer.learn(sentence);
}
System.out.println(perceptronLexicalAnalyzer.analyze("与特朗普通电话讨论太空探索技术公司"));
}