知识图谱:【数据清洗工具flashtext(五)】——flashtext使用示例
文章目录:关键字提取、删除关键字、函数封装示例、pyahocorasick版。本文通过示例演示 flashtext 中 KeywordProcessor 的用法(添加关键字、提取关键字、删除关键字),并给出基于 pyahocorasick 的对应实现。
·
文章目录
关键字提取
from flashtext import KeywordProcessor
# Basic extraction demo: register two keywords, then scan one sentence.
keyword_processor = KeywordProcessor()
# Map the surface form 'Big Apple' to the clean name 'New York'.
keyword_processor.add_keyword('Big Apple', 'New York')
# No clean name is given here, so a match reports the keyword itself.
keyword_processor.add_keyword('Bay Area')
# Collect the clean names of all keywords found in the sentence.
keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
>>> keywords_found
>>> # ['New York', 'Bay Area']
## 区分大小写
from flashtext import KeywordProcessor
# Case-sensitive demo: keywords only match when the letter case is identical.
keyword_processor = KeywordProcessor(case_sensitive=True)
# Map the surface form 'Big Apple' to the clean name 'New York'.
keyword_processor.add_keyword('Big Apple', 'New York')
keyword_processor.add_keyword('Bay Area')
# The sentence spells 'big Apple' with a lowercase 'b', so only 'Bay Area'
# is reported.
keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
>>> keywords_found
>>> # ['Bay Area']
同时添加多个关键词
from flashtext import KeywordProcessor
# Bulk-registration demo: add many keywords at once from a dict or a list.
keyword_processor = KeywordProcessor()
# Each key is a clean name; its list holds the surface forms mapped to it.
keyword_dict = {
"java": ["java_2e", "java programing"],
"product management": ["PM", "product manager"]
}
# Expected shape: {'clean_name': ['list of unclean names']}
keyword_processor.add_keywords_from_dict(keyword_dict)
# Or add keywords from a list; each one is added without a separate clean name.
keyword_processor.add_keywords_from_list(["java", "python"])
# The result is not assigned or printed, so it is only visible in a
# REPL or notebook.
keyword_processor.extract_keywords('I am a product manager for a java_2e platform')
# output ['product management', 'java']
删除关键字
from flashtext import KeywordProcessor
# Removal demo: register keywords, extract once, remove some, extract again.
keyword_processor = KeywordProcessor()
# Each key is a clean name; its list holds the surface forms mapped to it.
keyword_dict = {
    "java": ["java_2e", "java programing"],
    "product management": ["PM", "product manager"],
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# output ['product management', 'java']
keyword_processor.remove_keyword('java_2e')
# You can also remove keywords from a list / dictionary.
keyword_processor.remove_keywords_from_dict({"product management": ["PM"]})
keyword_processor.remove_keywords_from_list(["java programing"])
# Print the second result too, so both documented outputs appear when this
# runs as a script rather than in a REPL.
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# output ['product management']
函数封装示例
from flashtext import KeywordProcessor
def build_actree(wordlist):
    """Build a flashtext keyword matcher from a collection of words.

    Despite the name, the returned object is a flashtext
    ``KeywordProcessor`` (a trie-based matcher).

    Args:
        wordlist: Iterable of keyword strings to register.

    Returns:
        A ``KeywordProcessor`` holding every word from ``wordlist``.
    """
    actree = KeywordProcessor()
    for word in wordlist:
        # No clean name is passed, so a match reports the word itself.
        actree.add_keyword(word)
    return actree
def ac_detect(actree, text, span_info=True):
    """Return the keywords found in ``text``, in the order they occur.

    Args:
        actree: Matcher exposing ``extract_keywords(text, span_info=...)``,
            e.g. the object returned by ``build_actree``.
        text: The string to scan.
        span_info: Forwarded to ``extract_keywords``. When true, each match
            is a ``(keyword, start, end)`` tuple; when false, each match is
            the keyword string itself.

    Returns:
        A list of matched keywords, including repeats. The result holds
        whole keywords regardless of ``span_info``.
    """
    matches = actree.extract_keywords(text, span_info=span_info)
    if span_info:
        # Keep only the keyword; drop the start/end offsets.
        return [match[0] for match in matches if match]
    # Without span info the matches are already keyword strings; indexing
    # them with [0] would wrongly return only each keyword's first character.
    return list(matches)
# Demo: build a matcher from two keywords and scan a sample sentence.
wordlist = ['健康','减肥']
text = '今天你减肥了吗,今天你健康了吗,减肥 = 健康!'
actree = build_actree(wordlist)
# The result is not assigned or printed, so it is only visible in a
# REPL or notebook.
ac_detect(actree,text)
>>> CPU times: user 41 µs, sys: 0 ns, total: 41 µs
>>> Wall time: 47.2 µs
>>> ['减肥', '健康', '减肥', '健康']
pyahocorasick版
import ahocorasick
def build_actree(wordlist):
    """Build a pyahocorasick automaton from a collection of words.

    Args:
        wordlist: Iterable of keyword strings to register.

    Returns:
        An ``ahocorasick.Automaton`` in which each word is stored with the
        payload ``(index, word)``, where ``index`` is the word's position
        in ``wordlist``.
    """
    actree = ahocorasick.Automaton()  # start from an empty trie
    for index, word in enumerate(wordlist):
        # The payload is what a match returns later.
        actree.add_word(word, (index, word))
    # Convert the trie into an Aho-Corasick automaton so it can be searched.
    # NOTE(review): with an empty wordlist the automaton may remain unbuilt
    # and later iter() calls may fail; confirm against pyahocorasick docs.
    actree.make_automaton()
    return actree
def ac_detect(actree, text):
    """Return the keywords the automaton finds in ``text``, in match order.

    Args:
        actree: Automaton exposing ``iter(text)``, e.g. the object returned
            by ``build_actree``. Each yielded item is indexed as
            ``item[1][1]``, i.e. the word inside the ``(index, word)``
            payload stored at build time.
        text: The string to scan.

    Returns:
        A list of matched words, including repeats.
    """
    # hit[1] is the stored (index, word) payload; keep only the word.
    return [hit[1][1] for hit in actree.iter(text) if hit]
# Demo: build an automaton from two keywords and scan a sample sentence.
wordlist = ['健康','减肥']
text = '今天你减肥了吗,今天你健康了吗,减肥 = 健康!'
actree = build_actree(wordlist)
# The result is not assigned or printed, so it is only visible in a
# REPL or notebook.
ac_detect(actree,text)
>>> CPU times: user 10 µs, sys: 3 µs, total: 13 µs
>>> Wall time: 17.4 µs
>>> ['减肥', '健康', '减肥', '健康']
魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。
更多推荐

所有评论(0)