# -*- coding: utf-8 -*-
"""Backfill ``google_id`` for rows of the ``shop_list2`` table.

For a random batch of rows whose ``google_id`` is NULL, the script loads the
``view-source:`` page of the row's ``item_url`` through a remote Selenium
browser, pulls the first ``ChIJ...`` token out of the page source and upserts
it back into the table keyed by ``unique_id``.

Usage: run the file with an optional port argument (default 4444). The port
selects both the Selenium hub address and the docker container name
(``p<port>``) that is restarted before the run.
"""
# NOTE: import order is kept as in the original on purpose: the two
# ``import *`` lines come first so that the explicit imports below win if
# those modules happen to export a clashing name.
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
import dataset
import sys
from datetime import datetime
import pandas as pd
import time
import json
import re
import os

# NOTE(review): credentials are hard-coded in the connection URL; consider
# moving them to configuration / environment before sharing this file.
DB_URL = 'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4'

# Random batch of rows that still lack a google_id.
PENDING_QUERY = 'select * from shop_list2 where google_id is null ORDER BY RAND() limit 20'

# Number of batches fetched per run.
BATCH_COUNT = 1

# Matches ``null,\"ChIJ...\"`` (with literal backslashes) in the page source.
# Compiled once instead of on every row.
GOOGLE_ID_PATTERN = re.compile('null,\\\\"ChIJ[a-zA-Z0-9-_+]*\\\\"')


def brower_start(port):
    """Open a remote Chrome session on the local Selenium hub.

    Args:
        port: Port of the hub at ``http://127.0.0.1:<port>/wd/hub``.

    Returns:
        The connected ``webdriver.Remote`` instance.
    """
    options = webdriver.ChromeOptions()
    # NOTE(review): ``desired_capabilities`` is deprecated in Selenium 4;
    # switch to ``options=options`` when upgrading - confirm the installed
    # Selenium version first.
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser


def _extract_google_id(sourcetext):
    """Return the first ``ChIJ...`` id found in *sourcetext*, or ``None``."""
    match = GOOGLE_ID_PATTERN.search(sourcetext)
    if match is None:
        return None
    # Strip the ``null,`` prefix, the escaping backslashes and the quotes.
    return match.group(0).replace('null,', '').replace('\\', '').replace('"', '')


def main():
    """Restart the browser container, then fill in missing google ids.

    The optional first command-line argument is the Selenium/docker port
    (default 4444). Rows whose page source contains no id are reported and
    skipped, so they stay NULL in the table.
    """
    port = 4444
    if len(sys.argv) > 1:
        port = int(sys.argv[1])

    print('restart docker p{}'.format(port))
    os.system('sudo docker container restart p' + str(port))
    # Presumably gives the container time to come back up before connecting.
    time.sleep(8)

    print('drvier start...')
    driver = brower_start(port)
    try:
        db = dataset.connect(DB_URL)
        table = db['shop_list2']

        for _ in range(BATCH_COUNT):
            pending = pd.DataFrame(list(db.query(PENDING_QUERY)))

            for _, row in pending.iterrows():
                unique_id = row['unique_id']
                item_url = row['item_url']

                driver.get('view-source:' + item_url)
                time.sleep(0.5)

                google_id = _extract_google_id(driver.page_source)
                if google_id is None:
                    # Previously this raised IndexError and aborted the batch.
                    print('google_id not found for unique_id {}'.format(unique_id))
                    continue

                print(google_id)
                table.upsert({'unique_id': unique_id, 'google_id': google_id}, ['unique_id'])
    finally:
        # Always release the remote browser session, even on failure.
        driver.quit()


if __name__ == '__main__':
    main()