Python使用gensim计算文档相似性

WBOY
Release: 2016-06-10 15:05:25
Original
1295 people have browsed it

pre_file.py

#-*-coding:utf-8-*-
import MySQLdb
import MySQLdb as mdb
import os,sys,string
import jieba
import codecs
reload(sys)
sys.setdefaultencoding('utf-8')
#连接数据库
try:
  conn=mdb.connect(host='127.0.0.1',user='root',passwd='kongjunli',db='test1',charset='utf8')
except Exception,e:
  print e
  sys.exit()
#获取cursor对象操作数据库
cursor=conn.cursor(mdb.cursors.DictCursor) #cursor游标
#获取内容
sql='SELECT link,content FROM test1.spider;'
cursor.execute(sql)   #execute()方法,将字符串当命令执行
data=cursor.fetchall()#fetchall()接收全部返回结果行
f=codecs.open('C:\Users\kk\Desktop\hello-result1.txt','w','utf-8')
 
for row in data:    #row接收结果行的每行数据
  seg='/'.join(list(jieba.cut(row['content'],cut_all='False')))
  f.write(row['link']+' '+seg+'\r\n')
f.close()
 
cursor.close()
      #提交事务,在插入数据时必须

Copy after login

jiansuo.py

#-*-coding:utf-8-*-
import sys
import string
import MySQLdb
import MySQLdb as mdb
import gensim
from gensim import corpora,models,similarities
from gensim.similarities import MatrixSimilarity
import logging
import codecs
reload(sys)
sys.setdefaultencoding('utf-8')
 
con=mdb.connect(host='127.0.0.1',user='root',passwd='kongjunli',db='test1',charset='utf8')
with con:
  cur=con.cursor()
  cur.execute('SELECT * FROM cutresult_copy')
  rows=cur.fetchall()
  class MyCorpus(object):
    def __iter__(self):
      for row in rows:
        yield str(row[1]).split('/')
#开启日志
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',level=logging.INFO)
Corp=MyCorpus()
#将网页文档转化为tf-idf
dictionary=corpora.Dictionary(Corp)
corpus=[dictionary.doc2bow(text) for text in Corp] #将文档转化为词袋模型
#print corpus
tfidf=models.TfidfModel(corpus)#使用tf-idf模型得出文档的tf-idf模型
corpus_tfidf=tfidf[corpus]#计算得出tf-idf值
#for doc in corpus_tfidf:
  #print doc
###
'''
q_file=open('C:\Users\kk\Desktop\q.txt','r')
query=q_file.readline()
q_file.close()
vec_bow=dictionary.doc2bow(query.split(' '))#将请求转化为词带模型
vec_tfidf=tfidf[vec_bow]#计算出请求的tf-idf值
#for t in vec_tfidf:
 # print t
'''
###
query=raw_input('Enter your query:')
vec_bow=dictionary.doc2bow(query.split())
vec_tfidf=tfidf[vec_bow]
index=similarities.MatrixSimilarity(corpus_tfidf)
sims=index[vec_tfidf]
similarity=list(sims)
print sorted(similarity,reverse=True)

Copy after login

encodings.xml

<&#63;xml version="1.0" encoding="UTF-8"&#63;>
<project version="4">
 <component name="Encoding">
  <file url="PROJECT" charset="UTF-8" />
 </component>
</project>

Copy after login

misc.xml

<&#63;xml version="1.0" encoding="UTF-8"&#63;>
<project version="4">
 <component name="ProjectLevelVcsManager" settingsEditedManually="false">
  <OptionsSetting value="true" id="Add" />
  <OptionsSetting value="true" id="Remove" />
  <OptionsSetting value="true" id="Checkout" />
  <OptionsSetting value="true" id="Update" />
  <OptionsSetting value="true" id="Status" />
  <OptionsSetting value="true" id="Edit" />
  <ConfirmationsSetting value="0" id="Add" />
  <ConfirmationsSetting value="0" id="Remove" />
 </component>
 <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.11 (C:\Python27\python.exe)" project-jdk-type="Python SDK" />
</project>

Copy after login

modules.xml

<&#63;xml version="1.0" encoding="UTF-8"&#63;>
<project version="4">
 <component name="ProjectModuleManager">
  <modules>
   <module fileurl="file://$PROJECT_DIR$/.idea/爬虫练习代码.iml" filepath="$PROJECT_DIR$/.idea/爬虫练习代码.iml" />
  </modules>
 </component>
</project>
Copy after login

Related labels:
source:php.cn
Statement of this Website
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn
Popular Tutorials
More>
Latest Downloads
More>
Web Effects
Website Source Code
Website Materials
Front End Template