kw_plan_tree.py 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. import searchconsole
  2. import dataset
  3. import os
  4. import networkx as nx
  5. from pyvis.network import Network
  6. import sqlite3
  7. import csv
  8. import sys
  9. import codecs
  10. import difflib
  11. kwdict={}
  12. G = nx.Graph()
  13. def gcm0(strings):
  14. clusters = {}
  15. for string in (x.strip() for x in strings):
  16. match = difflib.get_close_matches(string, clusters.keys(), 8, 0.65)
  17. if match:
  18. clusters[match[0]].append(string)
  19. else:
  20. clusters[string] = [ string ]
  21. return clusters
  22. def proc_row(row):
  23. elmts=row.split(' ')
  24. for elmt in elmts:
  25. if kwdict.get(elmt) is None:
  26. kwdict[elmt]=1
  27. else:
  28. kwdict[elmt]+=1
  29. with codecs.open('C:\\tmp\\test9.csv', 'r','utf-16') as csvfile:
  30. spamreader = csv.reader(csvfile, delimiter='\t', quotechar='|')
  31. kwdict={}
  32. addict={}
  33. head=True
  34. rowlst=[]
  35. for row in spamreader:
  36. if head:
  37. head=False
  38. continue
  39. ll=len(row)
  40. proc_row(row[0])
  41. rowlst.append(row[0])
  42. clusters=gcm0(rowlst)
  43. keys=[]
  44. for k,v in clusters.items():
  45. keys.append(k)
  46. for x in v:
  47. G.add_edge(k,x,weight=1,label='')
  48. already_dict={}
  49. from strsimpy.qgram import QGram
  50. qgram = QGram(2)
  51. for k1 in keys:
  52. for k2 in keys:
  53. if k1!=k2:
  54. if qgram.distance(k1, k2)<=12:
  55. if already_dict.get(k1) is None and already_dict.get(k2) is None:
  56. already_dict[k1]=1
  57. already_dict[k2]=1
  58. G.add_edge(k1,k2,weight=1,label='')
  59. pyG = Network(height="900px", width="100%", bgcolor="#444444", font_color="white")
  60. pyG.set_options("""
  61. const options = {
  62. "nodes" : {
  63. "font" : {
  64. "size" : "30",
  65. "color" : "#ffffff"
  66. }
  67. },
  68. "physics": {
  69. "forceAtlas2Based": {
  70. "springLength": 100
  71. },
  72. "maxVelocity": 150,
  73. "minVelocity": 0.28,
  74. "solver": "forceAtlas2Based"
  75. }
  76. }
  77. """)
  78. G.remove_edges_from(nx.selfloop_edges(G))
  79. pyG.from_nx(G)
  80. #pyG.show_buttons(filter_=['physics'])
  81. pyG.show('news17.html')
  82. #print(clusters)
  83. sys.exit()