When building on the CoreNLP pipeline, you often need to customize the processing flow: the attribute you need may not be built into CoreNLP, or a built-in algorithm may not be accurate enough, so you have to modify or re-implement an algorithm or model yourself. That is when you need to define your own Annotator.
Enough talk; straight to the example from the official docs:
- Implement the Annotator interface
package com.sample.nlp.annotator;

import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.io.*;
import edu.stanford.nlp.util.ArraySet;

import java.util.*;

// A custom annotator must implement the Annotator interface
public class CustomLemmaAnnotator implements Annotator {

    HashMap<String, String> wordToLemma = new HashMap<>();

    // The constructor receives the pipeline's Properties, so the annotator
    // can read its own configuration keys (here: custom.lemma.lemmaFile)
    public CustomLemmaAnnotator(Properties props) {
        // load the lemma file; format should be TSV with word and lemma
        String lemmaFile = props.getProperty("custom.lemma.lemmaFile");
        List<String> lemmaEntries = IOUtils.linesFromFile(lemmaFile);
        for (String lemmaEntry : lemmaEntries) {
            wordToLemma.put(lemmaEntry.split("\\t")[0], lemmaEntry.split("\\t")[1]);
        }
    }

    // Perform the annotation: overwrite each token's lemma with the mapping
    // from the file, falling back to the surface word itself
    @Override
    public void annotate(Annotation annotation) {
        for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
            String lemma = wordToLemma.getOrDefault(token.word(), token.word());
            token.set(CoreAnnotations.LemmaAnnotation.class, lemma);
        }
    }

    // Declare the annotations this annotator requires as prerequisites;
    // the pipeline checks that earlier annotators have already produced them
    @Override
    public Set<Class<? extends CoreAnnotation>> requires() {
        return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
                CoreAnnotations.TextAnnotation.class,
                CoreAnnotations.TokensAnnotation.class,
                CoreAnnotations.SentencesAnnotation.class,
                CoreAnnotations.PartOfSpeechAnnotation.class
        )));
    }

    // Declare the annotations this annotator produces (satisfies),
    // so annotators later in the pipeline can depend on them
    @Override
    public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
        return Collections.singleton(CoreAnnotations.LemmaAnnotation.class);
    }
}
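The constructor above expects a tab-separated file with one word/lemma pair per line. As a hypothetical illustration (these entries are made up; the file name matches the custom.lemma.lemmaFile value set below), custom-lemmas.txt could look like:

has	have
ran	run
mice	mouse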
- Configure the custom annotator in a properties file and load it into the pipeline
$ cat custom-annotator.properties
# Register the custom annotator
# format: customAnnotatorClass.<name> = <fully qualified class>
# note: <name> is also the property prefix the annotator reads its settings from
customAnnotatorClass.custom.lemma = com.sample.nlp.annotator.CustomLemmaAnnotator
# Declare which annotators the pipeline runs, in order (an annotator is a
# single text-analysis step; its result is one or more annotations)
# tokenize: tokenization
# ssplit: sentence splitting
# pos: part-of-speech tagging
# lemma: lemmatization, e.g. has -> have
# ner: named entity recognition
# parse: syntactic parsing
# custom.lemma: the custom annotator registered above; it must appear after
#   tokenize, ssplit and pos, which its requires() declares as prerequisites
annotators = tokenize, ssplit, pos, lemma, ner, parse, custom.lemma
# segment (Chinese word segmentation)
tokenize.language = zh
tokensregex.rules = relregex/basic-relation.rules
segment.model = nlp/models/segmenter/chinese/ctb.gz
segment.sighanCorporaDict = nlp/models/segmenter/chinese
segment.serDictionary = nlp/models/segmenter/chinese/dict-chris6.ser.gz
segment.sighanPostProcessing = true
# sentence split
ssplit.boundaryTokenRegex = [.]|[!?]+|[\u3002]|[\uFF01\uFF1F]+
ssplit.boundaryMultiTokenRegex = /(?:\\n|\\*NL\\*|\\r)/{2,}
# new: configuration for the custom lemma annotator
custom.lemma.lemmaFile = custom-lemmas.txt
# pos
pos.model = nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger
# ner
ner.language = chinese
ner.model = nlp/models/ner/chinese.misc.distsim.crf.ser.gz
ner.applyNumericClassifiers = true
ner.useSUTime = false
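The same configuration can also be assembled in code instead of a properties file. A minimal sketch using only the keys shown above, trimmed to the prerequisites custom.lemma actually requires (for the Chinese models you would still add the segment/pos settings from the file):

import java.util.Properties;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

// register the custom annotator under the name "custom.lemma"
Properties props = new Properties();
props.setProperty("customAnnotatorClass.custom.lemma",
        "com.sample.nlp.annotator.CustomLemmaAnnotator");
// the pipeline order must satisfy CustomLemmaAnnotator.requires()
props.setProperty("annotators", "tokenize, ssplit, pos, custom.lemma");
// the key the annotator's constructor reads
props.setProperty("custom.lemma.lemmaFile", "custom-lemmas.txt");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);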
- Load and run
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.*;

String text = "我有只快乐的小猫咪哦";
// load the properties file defined above
StanfordCoreNLP pipeline = new StanfordCoreNLP("custom-annotator.properties");
CoreDocument coreDocument = new CoreDocument(text);
pipeline.annotate(coreDocument);
for (CoreLabel token : coreDocument.tokens()) {
    System.out.println(token.get(CoreAnnotations.LemmaAnnotation.class));
}
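For reference, a CoreDocument wraps the lower-level Annotation object that annotate(Annotation) in CustomLemmaAnnotator receives, so the pipeline can also be driven through that API directly. A minimal sketch printing each surface form next to its lemma:

Annotation annotation = new Annotation(text);
pipeline.annotate(annotation);
for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
    System.out.println(token.word() + "\t" + token.get(CoreAnnotations.LemmaAnnotation.class));
}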