# process_trends.py
# Flattens recent Google-Trends "trending search" JSON snapshots into
# one database row per news article.
  1. #import urllib.request
  2. import urllib
  3. import requests
  4. import traceback
  5. from bs4 import BeautifulSoup
  6. import json
  7. import os
  8. import time
  9. import sys
  10. import random
  11. from seleniumwire import webdriver
  12. from selenium.webdriver.common.by import By
  13. from selenium.webdriver.support.ui import WebDriverWait, Select
  14. from selenium.webdriver.support import expected_conditions as EC
  15. from selenium.webdriver.common.keys import Keys
  16. from selenium.webdriver.remote.webdriver import WebDriver
  17. import dataset
  18. import docker
  19. import datetime
  20. import gzip
  21. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/gtrends?charset=utf8mb4')
  22. table=db['trending_search_flat']
  23. cursor=db.query('SELECT * FROM gtrends.trending_search_json order by id desc limit 24')
  24. for c in cursor:
  25. js=c['json']
  26. cid=c['id']
  27. dt=c['dt']
  28. jsobj=json.loads(js)
  29. for j in jsobj:
  30. print(j['title']['query'])
  31. qtitle=j['title']['query']
  32. for a in j['articles']:
  33. print(a['title'])
  34. atitle=a['title']
  35. if a.get('image')!= None:
  36. # print(a['image'])
  37. print(a['image']['imageUrl'])
  38. print(a['image']['newsUrl'])
  39. aimg=a['image']['imageUrl']
  40. aurl=a['image']['newsUrl']
  41. table.insert({'cid':cid,'qtitle':qtitle,'atitle':atitle,'aimg':aimg,'aurl':aurl,'dt':dt})
  42. for r in j['relatedQueries']:
  43. print("-->" +r['query'])