get_google_id.py 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. # -*- coding: utf-8 -*-
  2. from selenium import webdriver
  3. from selenium.webdriver.common.action_chains import ActionChains
  4. from selenium.webdriver.common.keys import Keys
  5. from selenium.webdriver.support import expected_conditions as EC
  6. from selenium.webdriver.support.wait import WebDriverWait
  7. from selenium.webdriver.common.by import By
  8. from bs4 import BeautifulSoup
  9. from utility import database_access as DA
  10. from utility.parseutils import *
  11. from utility.connect import *
  12. import dataset
  13. import sys
  14. from datetime import datetime
  15. import pandas as pd
  16. import time
  17. import json
  18. import re, os
  def brower_start(port):
      """Open a Chrome session on the Selenium hub running on this machine.

      The hub is addressed as ``http://127.0.0.1:<port>/wd/hub`` and the
      session is requested with the capabilities of a default
      ``ChromeOptions`` object.

      Args:
          port: Port of the Selenium hub; it is rendered as text when the
              hub URL is assembled.

      Returns:
          The ``webdriver.Remote`` instance bound to the new session.
      """
      hub_url = 'http://127.0.0.1:{}/wd/hub'.format(port)
      capabilities = webdriver.ChromeOptions().to_capabilities()
      return webdriver.Remote(
          command_executor=hub_url,
          desired_capabilities=capabilities,
      )
  # Matches the id as it appears in the page source:  null,\"ChIJ...\"
  # (quotes are backslash-escaped there). Group 1 is the bare ChIJ... token.
  _GOOGLE_ID_PATTERN = re.compile(r'null,\\"(ChIJ[a-zA-Z0-9-_+]*)\\"')


  def _extract_google_id(sourcetext):
      """Return the first ``ChIJ...`` id found in *sourcetext*, or ``None``."""
      match = _GOOGLE_ID_PATTERN.search(sourcetext)
      return match.group(1) if match else None


  def main():
      """Fill in ``google_id`` for a random batch of ``shop_list2`` rows.

      Steps:
        1. Read the Selenium port from ``sys.argv[1]`` (default 4444).
        2. Restart the docker container named ``p<port>`` and wait 8 seconds.
        3. Open a remote browser session through ``brower_start``.
        4. Select up to 20 random rows whose ``google_id`` is NULL, load
           ``view-source:<item_url>`` for each one, pull the ``ChIJ...`` id
           out of the page source and upsert it keyed on ``unique_id``.

      Rows without an ``item_url`` and pages in which no id can be found are
      reported and skipped, so one bad page no longer aborts the batch. The
      browser session is always closed on exit.

      Raises:
          ValueError: if ``sys.argv[1]`` is present but is not an integer.
      """
      port = 4444
      if len(sys.argv) > 1:
          port = int(sys.argv[1])

      print('restart docker p{}'.format(port))
      # `port` is always an int at this point, so the shell string is safe.
      os.system('sudo docker container restart p' + str(port))
      # Presumably gives the restarted container time to accept connections.
      time.sleep(8)

      print('drvier start...')
      driver = brower_start(port)
      try:
          # NOTE(review): database credentials are hard-coded in source;
          # consider loading the connection URL from the environment instead.
          db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
          table = db['shop_list2']

          # Materialise the batch first so the result set is fully consumed
          # before the upserts below run on the same connection.
          rows = list(db.query('select * from shop_list2 where google_id is null ORDER BY RAND() limit 20'))

          for row in rows:
              unique_id = row['unique_id']
              item_url = row['item_url']
              if not item_url:
                  print('skip {}: empty item_url'.format(unique_id))
                  continue

              driver.get('view-source:' + item_url)
              time.sleep(0.5)

              google_id = _extract_google_id(driver.page_source)
              if google_id is None:
                  # Previously an IndexError here stopped the whole run.
                  print('skip {}: google_id not found'.format(unique_id))
                  continue

              print(google_id)
              table.upsert({'unique_id': unique_id, 'google_id': google_id}, ['unique_id'])
      finally:
          # Release the remote browser session even when a page load fails.
          driver.quit()
  # Run the batch only when this file is executed as a script, not on import.
  if __name__ == "__main__":
      main()