Validating Custom Dictionary Integration and Text Segmentation in jieba
This test suite validates custom user dictionary functionality and word segmentation features in the Jieba Chinese text segmentation library. It covers dictionary loading, word addition/deletion, and segmentation accuracy with various text inputs.
Test Coverage Overview
Implementation Analysis
Technical Details
Best Practices Demonstrated
fxsjy/jieba
test/test_userdict.py
#encoding=utf-8
from __future__ import print_function, unicode_literals
import sys
sys.path.append("../")
import jieba
jieba.load_userdict("userdict.txt")
import jieba.posseg as pseg
# Adjust the dictionary at runtime, after the user dictionary file has been
# loaded: register two extra words, then remove one entry.
for extra_word in ('石墨烯', '凱特琳'):
    jieba.add_word(extra_word)
# NOTE(review): presumably '自定义词' is supplied by userdict.txt - confirm.
jieba.del_word('自定义词')
# Three-line sample text used by the segmentation checks below. The pieces are
# adjacent string literals joined by implicit concatenation; the first two end
# with an explicit "\n" escape so that each literal stays on one physical line
# (a raw line break inside a quoted string is a syntax error).
test_sent = (
    "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n"
    "例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n"
    "「台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。"
)
# Plain segmentation: print the tokens of the sample text joined by "/".
words = jieba.cut(test_sent)
print('/'.join(words))
print("="*40)

# Part-of-speech tagging: print every token with its flag on a single line.
result = pseg.cut(test_sent)
for w in result:
    # end=' ' keeps all pairs on one line instead of one pair per line.
    print(w.word, "/", w.flag, ", ", end=' ')
# The loop above never terminates its line, so begin the separator with an
# explicit newline escape.
print("\n" + "="*40)
# Segment one English-only and one mixed English/Chinese sample, printing the
# tokens of each joined by "/", then a separator line.
for sample in ('easy_install is great', 'python 的正则表达式是好用的'):
    terms = jieba.cut(sample)
    print('/'.join(terms))
print("=" * 40)
# test frequency tune
# Each entry pairs a sentence with the segments that the joined word is
# expected to be cut into.
testlist = [
    ('今天天气不错', ('今天', '天气')),
    ('如果放到post中将出错。', ('中', '将')),
    ('我们中出了一个叛徒', ('中', '出')),
]
for sent, seg in testlist:
    # Segmentation before tuning (HMM disabled).
    print('/'.join(jieba.cut(sent, HMM=False)))
    word = ''.join(seg)
    # Tuple elements are evaluated left to right, so get_FREQ() reports the
    # value from before suggest_freq(seg, True) is called.
    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
    # Same sentence again, after the suggest_freq() call.
    print('/'.join(jieba.cut(sent, HMM=False)))
    print("-"*40)