harvesttext.resources 源代码

#coding=utf-8
#!/usr/bin/env python

# Resources

# 褒贬义词典   清华大学 李军
#
# 此资源被用于以下论文中:
# Jun Li and Maosong Sun, Experimental Study on Sentiment Classification of Chinese Review using Machine Learning Techniques, in Proceding of IEEE NLPKE 2007
# 李军 中文评论的褒贬义分类实验研究 硕士论文 清华大学 2008
import os
import json

[文档]def get_qh_sent_dict(): """ 获得参考褒贬义词典: 褒贬义词典 清华大学 李军 此资源被用于以下论文中: Jun Li and Maosong Sun, Experimental Study on Sentiment Classification of Chinese Review using Machine Learning Techniques, in Proceding of IEEE NLPKE 2007 李军 中文评论的褒贬义分类实验研究 硕士论文 清华大学 2008 :return: qh_sent_dict = {"pos":[words],"neg":[words]} """ pwd = os.path.abspath(os.path.dirname(__file__)) with open(pwd+"/resources/qh_sent_dict.json","r",encoding="utf-8") as f: qh_sent_dict = json.load(f) return qh_sent_dict
[文档]def get_baidu_stopwords(): """ 获得百度停用词列表 来源,网上流传的版本:https://wenku.baidu.com/view/98c46383e53a580216fcfed9.html 包含了中英文常见词及部分标点符号 :return: stopwords: set of string """ pwd = os.path.abspath(os.path.dirname(__file__)) with open(pwd + "/resources/百度停用词列表.json", "r", encoding="utf-8") as f: stopwords = json.load(f) return set(stopwords)
[文档]def get_qh_typed_words(used_types = ['IT', '动物', '医药', '历史人名', '地名', '成语', '法律', '财经', '食物']): """ THUOCL:清华大学开放中文词库 http://thuocl.thunlp.org/ IT 财经 成语 地名 历史名人 诗词 医学 饮食 法律 汽车 动物 :param used_types: :return: typed_words: 字典,键为类型,值为该类的词语组成的set """ pwd = os.path.abspath(os.path.dirname(__file__)) with open(pwd + "/resources/THUOCL.json", "r", encoding="utf-8") as f: typed_words0 = json.load(f) typed_words = dict() for type0 in typed_words0: if type0 in used_types: typed_words[type0] = set(typed_words0[type0]) return typed_words
[文档]def get_sanguo(): """ 获得三国演义原文 :return: ["章节1文本","章节2文本",...] """ pwd = os.path.abspath(os.path.dirname(__file__)) with open(pwd+"/resources/sanguo_docs.json","r",encoding="utf-8") as f: docs = json.load(f) return docs
[文档]def get_sanguo_entity_dict(): """ 获得三国演义中的人名、地名、势力名的知识库。 自行搭建的简单版,一定有遗漏和错误,仅供参考使用 :return: entity_mention_dict,entity_type_dict """ import json pwd = os.path.abspath(os.path.dirname(__file__)) with open(pwd+"/resources/sanguo_entity_dict.json","r",encoding="utf-8") as f: entity_dict = json.load(f) return entity_dict["mention"], entity_dict["type"]