process_gtrend.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. #import urllib.request
  2. import urllib
  3. import requests
  4. import traceback
  5. from bs4 import BeautifulSoup
  6. import json
  7. import os
  8. import time
  9. import sys
  10. import random
  11. from seleniumwire import webdriver
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.support.ui import WebDriverWait, Select
  14. from selenium.webdriver.support import expected_conditions as EC
  15. from selenium.webdriver.common.keys import Keys
  16. from selenium.webdriver.remote.webdriver import WebDriver
  17. import dataset
  18. import docker
  19. import datetime
  20. import gzip
  21. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
  22. table=db['kw_related']
  23. lst=[]
  24. cursor=db.query('SELECT distinct kw FROM gtrends.gtrend_jsraw order by id desc')
  25. for c in cursor:
  26. lst.append(c['kw'])
  27. for l in lst:
  28. fullkw=l
  29. alldict={}
  30. cursor=db.query('SELECT * FROM gtrends.gtrend_jsraw where kw="'+fullkw+'" order by id desc')
  31. for c in cursor:
  32. js=c['json']
  33. jsobj=json.loads(js)
  34. # jsobj=jsobj['rankedKeyword']
  35. for j in jsobj:
  36. kws=j['rankedKeyword']
  37. for kw in kws:
  38. if 'query' in kw:
  39. print(kw['query'])
  40. alldict[kw['query']]=1
  41. for k,v in alldict.items():
  42. try:
  43. table.insert({'original':fullkw,'kw':k})
  44. except:
  45. print('except')
  46. print(k)
  47. # if len(alldict)>=5:
  48. # break
  49. print(alldict)
  50. # break
  51. # print(j['title']['query'])
  52. # for a in j['articles']:
  53. # print(a['title'])
  54. # if a.get('image')!= None:
  55. # print(a['image'])
  56. # print(a['image']['imageUrl'])
  57. ## print(a['image']['newsUrl'])
  58. # for r in j['relatedQueries']:
  59. # print("-->" +r['query'])