main.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. from pydoc import HTMLDoc
  2. from fastapi import FastAPI
  3. import dataset
  4. import sys
  5. import os
  6. import time
  7. from fastapi.middleware.cors import CORSMiddleware
  8. from fastapi.staticfiles import StaticFiles
  9. from pydantic import BaseModel
  10. from fastapi import FastAPI, Form, Response, File, UploadFile, Request
  11. import subprocess
  12. import suggests
  13. from typing import Optional
  14. import networkx as nx
  15. import pyvis
  16. import time
  17. from pyvis.network import Network
  18. import pickle
  19. import logging
  20. import threading
  21. import random
  22. import string
  23. from fastapi.responses import HTMLResponse,RedirectResponse, FileResponse
  24. import dataset
  25. import traceback
  26. import time
  27. from selenium import webdriver
  28. from selenium.webdriver.common.keys import Keys
  29. from selenium.webdriver.common.by import By
  30. from selenium.webdriver.chrome.service import Service
  31. import networkx as nx
  32. from pyvis.network import Network
  33. import csv
  34. import sys
  35. import codecs
  36. import difflib
  37. import pymysql
  38. pymysql.install_as_MySQLdb()
  39. from pathlib import Path
  40. from tempfile import NamedTemporaryFile
  41. from typing import Callable
  42. import shutil
  43. driver = None
  44. def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
  45. return ''.join(random.choice(chars) for _ in range(size))
  46. app = FastAPI()
  47. origins = ["*"]
  48. app.add_middleware(
  49. CORSMiddleware,
  50. allow_origins=origins,
  51. allow_credentials=True,
  52. allow_methods=["*"],
  53. allow_headers=["*"],
  54. )
  55. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
  56. app.mount("/web", StaticFiles(directory="/Users/zooeytsai/kw_tools/web/static"), name="static")
  57. # app.mount("/web", StaticFiles(directory="/root/src/kw_tools/web/static"), name="static")
  58. def thread_function(kw):
  59. global db
  60. print(kw)
  61. G = nx.Graph()
  62. for k in kw:
  63. s = suggests.suggests.get_suggests(k, source='google')
  64. for sg in s['suggests']:
  65. G.add_edge(k,sg,weight=1)
  66. print(sg)
  67. time.sleep(1)
  68. s2 = suggests.suggests.get_suggests(k, source='google')
  69. for elmt in s2['suggests']:
  70. G.add_edge(sg,elmt,weight=1)
  71. # G.remove_nodes_from(list(nx.isolates(G)))
  72. G.remove_edges_from( list(nx.selfloop_edges(G)))
  73. # pickle.dump( G, open( "gs2.p", "wb" ) )
  74. pyG = Network(height="750px", width="100%",bgcolor="#333333",font_color="white")
  75. pyG.from_nx(G)
  76. id=id_generator()
  77. db['gen_graph'].insert({'filename':str(id),'kw':str(kw)})
  78. # pyG.save_graph('gstest')
  79. # pyG.show('static/gs/'+str(id)+'.html')
  80. pyG.save_graph('static/gs/'+str(id)+'.html')
  81. @app.get("/tree_list/",response_class=HTMLResponse)
  82. async def tree_list():
  83. # global db
  84. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
  85. html="<html><body><h2>清單</h2></br>請一分鐘後refresh </br></br>"
  86. html+="<table border='1'>"
  87. cursor=db.query('select filename,kw from gen_graph order by id desc')
  88. cnt=0
  89. for c in cursor:
  90. html+="<tr><td>"+c['kw']+"</td>"
  91. html+="<td><a href='/web/gs/"+c['filename']+".html'>"+c['filename']+"</a></td></tr>"
  92. cnt+=1
  93. if cnt > 10:
  94. break
  95. html+="</table></body></html>"
  96. return html
  97. @app.post("/proj_kw/",response_class=HTMLResponse)
  98. async def proj_kw(proj: str = Form(...),kws:Optional[str] = Form(None)):
  99. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  100. table=db['serp_jobs']
  101. for kw in kws:
  102. table.insert({'proj':proj,'kw':kw})
  103. return "OK請稍後"
  104. #response_class=RedirectResponse
  105. @app.post("/gen_tree/",response_class=HTMLResponse)
  106. async def func_expand(kw: str = Form(...),kw2:Optional[str] = Form(None),kw3:Optional[str] = Form(None),kw4:Optional[str] = Form(None) ):
  107. kwlst=[]
  108. if len(kw)>1:
  109. kwlst.append(kw)
  110. if kw2 is not None:
  111. kwlst.append(kw2)
  112. if kw3 is not None:
  113. kwlst.append(kw3)
  114. if kw4 is not None:
  115. kwlst.append(kw4)
  116. x = threading.Thread(target=thread_function, args=(kwlst,))
  117. x.start()
  118. # return "ok"
  119. return RedirectResponse(url="/tree_list",status_code=302)
  120. # return HTMLResponse('<html><head><meta http-equiv="refresh" content="0; URL="/tree_list" /></head></html>')
  121. def restart_browser():
  122. global driver
  123. if driver is not None:
  124. print('closing')
  125. driver.quit()
  126. driver = None
  127. try:
  128. options = webdriver.ChromeOptions()
  129. options.add_argument("--no-sandbox")
  130. options.add_argument("--disable-dev-shm-usage")
  131. options.add_argument('--headless')
  132. #options.add_argument('--remote-debugging-port=9222')
  133. #options.add_experimental_option("debuggerAddress", "127.0.0.1:9922")
  134. options.add_argument("--incognito")
  135. try:
  136. driver = webdriver.Chrome(options=options)
  137. str1 = driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0]
  138. print('這裡',str1)
  139. #driver = webdriver.Remote(command_executor='http://127.0.0.1:'+str(portnum)+'/wd/hub',options=options)
  140. except:
  141. return None
  142. except:
  143. print('開啟失敗')
  144. driver=None
  145. return None
  146. return driver
  147. @app.post("/ranking/")
  148. async def ranking(kw: str = Form(...), domain:str = Form(...),kw2:Optional[str] = Form(None),domain2:Optional[str] = Form(None),kw3:Optional[str] = Form(None),domain3:Optional[str] = Form(None),kw4:Optional[str] = Form(None),domain4:Optional[str] = Form(None),kw5:Optional[str] = Form(None),domain5:Optional[str] = Form(None)):
  149. kwlst = []
  150. kwlst.append([kw,domain])
  151. if kw2 is not None:
  152. kwlst.append([kw2,domain2])
  153. if kw3 is not None:
  154. kwlst.append([kw3,domain3])
  155. if kw4 is not None:
  156. kwlst.append([kw4,domain4])
  157. if kw5 is not None:
  158. kwlst.append([kw5,domain5])
  159. result = []
  160. for i in kwlst:
  161. driver = restart_browser()
  162. # escaped_search_term=urllib.parse.quote(term)
  163. googleurl = 'https://www.google.com/?num=100'
  164. driver.get(googleurl)
  165. time.sleep(6)
  166. send_kw_elmt = driver.find_element(By.XPATH,
  167. '/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input')
  168. send_kw_elmt.send_keys(i[0])
  169. time.sleep(3)
  170. send_kw_elmt.send_keys(Keys.ENTER)
  171. time.sleep(6)
  172. elmts = driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
  173. cnt = 1
  174. datadict = {'搜尋詞': [], '結果標題': [], '結果網址': [], '結果名次': []}
  175. domain_name = i[1]
  176. for elmt in elmts:
  177. try:
  178. href = elmt.get_attribute('href')
  179. if domain_name in href:
  180. datadict['搜尋詞'].append(i[0])
  181. datadict['結果標題'].append(elmt.text)
  182. datadict['結果網址'].append(href)
  183. datadict['結果名次'].append(str(cnt))
  184. cnt += 1
  185. except:
  186. print('href2 exception')
  187. traceback.print_exc()
  188. result.append(datadict)
  189. print(domain_name)
  190. print(datadict)
  191. driver.quit()
  192. print('數量',len(elmts))
  193. time.sleep(90)
  194. # return "ok"
  195. # return RedirectResponse(url="/ranking_result",)
  196. html = f"<html><body>{result}</body></html>"
  197. return html
  198. @app.get("/ranking_result/")
  199. async def tree_list():
  200. html = "<table border='1'>"
  201. # html += "<tr><td>" + c['kw'] + "</td>"
  202. return html
  203. kwdict={}
  204. G = nx.Graph()
  205. def gcm0(strings):
  206. clusters = {}
  207. for string in (x.strip() for x in strings):
  208. match = difflib.get_close_matches(string, clusters.keys(), 8, 0.65)
  209. if match:
  210. clusters[match[0]].append(string)
  211. else:
  212. clusters[string] = [ string ]
  213. return clusters
  214. def proc_row(row):
  215. elmts=row.split(' ')
  216. for elmt in elmts:
  217. if kwdict.get(elmt) is None:
  218. kwdict[elmt]=1
  219. else:
  220. kwdict[elmt]+=1
  221. def save_upload_file_tmp(file: UploadFile) -> Path:
  222. try:
  223. suffix = Path(file.filename).suffix
  224. with NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
  225. shutil.copyfileobj(file.file, tmp)
  226. tmp_path = Path(tmp.name)
  227. finally:
  228. file.file.close()
  229. return tmp_path
  230. @app.post("/kwtree")
  231. async def kwtree(file: UploadFile = File(...)):
  232. destination_file_path = "/root/src/kw_tools/web" + file.filename
  233. with codecs.open(destination_file_path,'r','utf-16') as out_file:
  234. # with codecs.open(file.file) as csvfile:
  235. # csv_reader = csv.reader(codecs.iterdecode(file.file, 'utf-8'))
  236. csvfile = csv.reader(out_file, delimiter='\t', quotechar='|')
  237. # spamreader = csv.reader(csvfile, delimiter='\t', quotechar='|')
  238. kwdict = {}
  239. addict = {}
  240. head = True
  241. rowlst = []
  242. for row in csvfile:
  243. if head:
  244. head = False
  245. continue
  246. ll = len(row)
  247. proc_row(row[0])
  248. if row not in rowlst:
  249. rowlst.append(row[0])
  250. head = True
  251. clusters = gcm0(rowlst)
  252. keys = []
  253. for k, v in clusters.items():
  254. # if len(v) > 20:
  255. keys.append(k)
  256. for x in v:
  257. G.add_edge(k, x, weight=1, label='')
  258. already_dict = {}
  259. from strsimpy.qgram import QGram
  260. qgram = QGram(2)
  261. for k1 in keys:
  262. for k2 in keys:
  263. if k1 != k2:
  264. if qgram.distance(k1, k2) <= 12:
  265. if already_dict.get(k1) is None and already_dict.get(k2) is None:
  266. already_dict[k1] = 1
  267. already_dict[k2] = 1
  268. G.add_edge(k1, k2, weight=1, label='')
  269. pyG = Network(height="100%", width="100%", bgcolor="#444444", font_color="white")
  270. pyG.set_options("""
  271. const options = {
  272. "nodes" : {
  273. "font" : {
  274. "size" : "30",
  275. "color" : "#ffffff"
  276. }
  277. },
  278. "physics": {
  279. "forceAtlas2Based": {
  280. "springLength": 100
  281. },
  282. "maxVelocity": 150,
  283. "minVelocity": 0.28,
  284. "solver": "forceAtlas2Based"
  285. }
  286. }
  287. """)
  288. G.remove_edges_from(nx.selfloop_edges(G))
  289. pyG.from_nx(G)
  290. # pyG.show_buttons(filter_=['physics'])
  291. news_file = random.randint(0,100)
  292. pyG.show(f'news{news_file}.html')
  293. check_file = False
  294. # while
  295. # print(clusters)
  296. # sys.exit()
  297. return FileResponse(f'/root/kw_tools/web/news{news_file}.html',media_type='text/html')