NO IMAGE

文字情感分析

上一篇完成了新浪微博的爬取,並解決了模擬登入的問題;這一篇小編開始研究如何判斷微博文
本的情感是正面還是負面。網上搜尋到的方法主要有兩類:機器學習和情感字典。機器學
習需要比較深的知識儲備,而小編還是小白,所以選擇了情感字典的方法。好了,直接
上程式碼,可以直接執行。
前提是需要先安裝分詞庫 jieba,
直接執行 pip install jieba
即可安裝。

一、資料準備

先要準備以下四個字典:

  1. 情感字典,如:
    最尼瑪 -6.70400012637
    擾民 -6.49756445867
    fuck… -6.32963390433
    RNM -6.21861284426
    wcnmlgb -5.96710044003
    2.5: -5.90459648251
  2. 停用詞字典,如:
    !,”,#,$,&
  3. 副詞,如:
    百分之百 6
    倍加 6
    備至 6
    不得了 6
    不堪 6
    不可開交 6
    不亦樂乎 6
  4. 否定詞,如:不,沒,無,非,莫,弗,勿
    如果需要這些字典,可以到 CSDN 下載,我已經上傳了。

二、情感分析

#!/usr/bin/env python
#coding:utf-8
import io

import jieba
class SentimentAnalysis(object):
    """Dictionary-based sentiment scorer for a single sentence.

    Four UTF-8 text files drive the analysis:

    * sentiment dictionary: one ``word score`` pair per line;
    * negation list: one word per line;
    * degree-adverb dictionary: one ``word weight`` pair per line;
    * stop-word list: one word per line.

    Typical use is ``setSentence(text)`` followed by ``getScore()``.
    """

    def __init__(self, sentiment, noword, adverb, stopword, tokenizer=None):
        """Load the four dictionaries.

        Args:
            sentiment: path of the sentiment dictionary file.
            noword: path of the negation word list.
            adverb: path of the degree-adverb dictionary file.
            stopword: path of the stop-word list.
            tokenizer: optional callable taking the sentence and returning an
                iterable of tokens. When omitted, the sentence is segmented
                with ``jieba.cut(sentence, cut_all=False)``.

        Raises:
            IOError/OSError: if one of the files cannot be opened.
        """
        self.__tokenizer = tokenizer
        # Start with an empty sentence so getScore() works (and returns 0.0)
        # even when setSentence() has not been called yet.
        self.__sentence = ''
        self.__readFile(sentiment, noword, adverb, stopword)

    def __readLines(self, path):
        """Return the non-empty lines of a text file, line endings removed."""
        # 'utf-8-sig' decodes plain UTF-8 and also drops a leading BOM, which
        # would otherwise be glued to the first word of the file.
        with io.open(path, encoding='utf-8-sig') as handle:
            lines = [line.rstrip(u'\r\n') for line in handle]
        return [line for line in lines if line]

    def __readWeights(self, path, report):
        """Parse ``word number`` lines into a ``{word: float}`` dict.

        Lines without a second field, or whose second field is not a number,
        are skipped; when ``report`` is true each skipped line is printed.
        """
        weights = {}
        for line in self.__readLines(path):
            fields = line.split()
            try:
                weights[fields[0]] = float(fields[1])
            except (IndexError, ValueError):
                if report:
                    print(u"資料錯誤:" + line)
        return weights

    def __readFile(self, sentiment, noword, adverb, stopword):
        """Read the four dictionary files into the instance lookup tables."""
        # Malformed sentiment lines are skipped silently, as the original
        # loader did; malformed adverb lines are reported.
        self.__sentList = self.__readWeights(sentiment, False)
        # Sets give constant-time membership tests for every token.
        self.__noword = set(self.__readLines(noword))
        self.__adverb = self.__readWeights(adverb, True)
        self.__stopword = set(self.__readLines(stopword))

    def setSentence(self, sentence):
        """Store the sentence to analyse, with leading whitespace removed."""
        self.__sentence = sentence.lstrip()

    def preDetail(self):
        """Tokenize the current sentence and classify every kept token.

        Stop words are dropped; the remaining tokens are numbered from 0 and
        each position (as a ``str`` key) lands in exactly one of the first
        three dicts.

        Returns:
            A tuple ``(senWord, notWord, degreeWord, newWords)``:

            * ``senWord``: position -> sentiment score (``0`` for tokens that
              are in none of the dictionaries);
            * ``notWord``: position -> ``-1`` for negation words;
            * ``degreeWord``: position -> weight of a degree adverb;
            * ``newWords``: position -> the token itself.
        """
        if self.__tokenizer is None:
            tokens = jieba.cut(self.__sentence, cut_all=False)
        else:
            tokens = self.__tokenizer(self.__sentence)
        kept = [word for word in tokens if word not in self.__stopword]

        newWords = {}
        senWord = {}
        notWord = {}
        degreeWord = {}
        for position, word in enumerate(kept):
            key = str(position)
            newWords[key] = word
            # Precedence: degree adverb, then negation, then sentiment word.
            if word in self.__adverb:
                degreeWord[key] = self.__adverb[word]
            elif word in self.__noword:
                notWord[key] = -1
            else:
                senWord[key] = self.__sentList.get(word, 0)
        return senWord, notWord, degreeWord, newWords

    def getScore(self):
        """Return the sentiment score of the current sentence as a float.

        Tokens are visited in order. Each negation word flips the sign of the
        running weight and each degree adverb multiplies it; the next
        non-modifier token contributes ``weight * score`` and resets the
        weight to 1. Modifiers that are not followed by another token have no
        effect. A positive total means positive sentiment, a negative total
        means negative sentiment.
        """
        senWord, notWord, degreeWord, newWords = self.preDetail()
        weight = 1.0
        score = 0.0
        for position in range(len(newWords)):
            key = str(position)
            if key in senWord:
                score += weight * float(senWord[key])
                # The modifiers only apply to the word they precede.
                weight = 1.0
            elif key in notWord:
                weight *= -1
            elif key in degreeWord:
                weight *= float(degreeWord[key])
        return score
def getAnalysis(sentiment='情感字典.txt', noword='否定詞.txt',
                adverb='副詞.txt', stopword='停用詞.txt'):
    """Build a SentimentAnalysis from the four dictionary files.

    Called without arguments, it loads the dictionaries from their default
    file names, resolved relative to the current working directory. Each
    path can be overridden.

    Args:
        sentiment: path of the sentiment dictionary.
        noword: path of the negation word list.
        adverb: path of the degree-adverb dictionary.
        stopword: path of the stop-word list.

    Returns:
        A SentimentAnalysis instance constructed from these four paths.
    """
    return SentimentAnalysis(sentiment, noword, adverb, stopword)
# Usage example: build the analyser, set the sentence, print its score.
# getAnalysis is defined in this file, so it is called directly; no
# `analysis` module is imported here.
s = getAnalysis()
s.setSentence('句子')
# A positive score means positive news;
# a negative score means negative news.
print(s.getScore())