|
@@ -0,0 +1,101 @@
|
|
|
+import searchconsole
|
|
|
+import dataset
|
|
|
+import os
|
|
|
+import networkx as nx
|
|
|
+from pyvis.network import Network
|
|
|
+import sqlite3
|
|
|
+import csv
|
|
|
+import sys
|
|
|
+import codecs
|
|
|
+import difflib
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
# Token -> number of occurrences; proc_row() updates this table in place.
kwdict = dict()

# Undirected graph that receives the cluster edges and is rendered at the end.
G = nx.Graph()
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def gcm0(strings, cutoff=0.65):
    """Greedily group similar strings into clusters.

    Each input string is stripped of surrounding whitespace and compared,
    using ``difflib.get_close_matches``, against the keys of the clusters
    created so far. If at least one key reaches ``cutoff``, the string is
    appended to the cluster of the best-scoring key; otherwise the string
    becomes the key of a new cluster that initially contains only itself.

    Strings are processed in iteration order, so the result depends on the
    order of ``strings``: the first member of a group becomes its key.

    Args:
        strings: Iterable of strings to cluster.
        cutoff: Minimum similarity ratio (0..1) required to join an existing
            cluster. Defaults to 0.65.

    Returns:
        A dict mapping each cluster key to the list of (stripped) strings
        assigned to it. The key itself is always the first list element.
    """
    clusters = {}
    for text in (raw.strip() for raw in strings):
        # Only the single best candidate is ever used, so ask for exactly one
        # match instead of ranking several that would be thrown away.
        best = difflib.get_close_matches(text, clusters, n=1, cutoff=cutoff)
        if best:
            clusters[best[0]].append(text)
        else:
            clusters[text] = [text]
    return clusters
|
|
|
+
|
|
|
+
|
|
|
def proc_row(row):
    """Count the tokens of ``row`` in the module-level ``kwdict``.

    ``row`` is split on single space characters and the counter of every
    resulting token is incremented by one (tokens seen for the first time
    start at 1). Nothing is returned; ``kwdict`` is updated in place.
    """
    for token in row.split(' '):
        kwdict[token] = kwdict.get(token, 0) + 1
|
|
|
+
|
|
|
# Load the phrases to cluster from a tab-separated, UTF-16 encoded file.
# NOTE(review): the input path is hard-coded; adjust it for your machine.
#
# The file is opened with the built-in open() and newline='' as the csv module
# documents, so that the csv reader itself decides where a record ends.
with open('C:\\tmp\\test.csv', 'r', encoding='utf-16', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='\t', quotechar='|')
    kwdict = {}   # start from an empty token table; proc_row() fills it
    rowlst = []   # first-column value of every data row, in file order

    next(spamreader, None)  # the first row is a header, not data
    for row in spamreader:
        if not row:
            # csv.reader yields an empty list for blank lines; indexing
            # row[0] on those would raise IndexError.
            continue
        proc_row(row[0])
        rowlst.append(row[0])
|
|
|
+
|
|
|
+
|
|
|
# Cluster the collected phrases, remember the cluster keys for the linking
# step further down, and connect every key to each member of its cluster.
# A key is also a member of its own cluster, so this creates self-loops too.
clusters = gcm0(rowlst)
keys = list(clusters)
for head, members in clusters.items():
    for member in members:
        G.add_edge(head, member, weight=1, label='')
|
|
|
+
|
|
|
+
|
|
|
from strsimpy.qgram import QGram

# Connect cluster keys whose QGram(2) distance is at most 12. Every key takes
# part in at most one such link: once a key has been linked (on either side)
# it is recorded in already_dict and ignored from then on.
already_dict = {}
qgram = QGram(2)
for k1 in keys:
    if k1 in already_dict:
        # Already linked earlier; no pair starting here can be accepted.
        continue
    for k2 in keys:
        # Check the cheap conditions first so the distance is only computed
        # for pairs that could actually be linked.
        if k2 == k1 or k2 in already_dict:
            continue
        if qgram.distance(k1, k2) <= 12:
            already_dict[k1] = 1
            already_dict[k2] = 1
            G.add_edge(k1, k2, weight=1, label='')
            # k1 is now linked, so no further partner can be accepted for it.
            break
|
|
|
+
|
|
|
+
|
|
|
# Display options passed to pyvis as a single string: node label font and the
# forceAtlas2Based physics solver settings.
VIS_OPTIONS = """
const options = {
"nodes" : {
 "font" : {
 "size" : "30",
 "color" : "#ffffff"
 }
 },
 "physics": {
 "forceAtlas2Based": {
 "springLength": 100
 },
 "maxVelocity": 150,
 "minVelocity": 0.28,
 "solver": "forceAtlas2Based"
 }
}
"""

# Drop the self-loops (each cluster key was linked to itself as a member)
# before handing the graph to pyvis.
G.remove_edges_from(nx.selfloop_edges(G))

pyG = Network(
    height="600px",
    width="100%",
    bgcolor="#444444",
    font_color="white",
)
pyG.set_options(VIS_OPTIONS)
pyG.from_nx(G)

# To tune the layout interactively, enable the physics panel:
# pyG.show_buttons(filter_=['physics'])
pyG.show('news17.html')

sys.exit()
|