1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
- # -*- coding: utf-8 -*-
- from selenium import webdriver
- from selenium.webdriver.common.action_chains import ActionChains
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.webdriver.support.wait import WebDriverWait
- from selenium.webdriver.common.by import By
- from bs4 import BeautifulSoup
- from utility import database_access as DA
- from utility.parseutils import *
- from utility.connect import *
- import dataset
- import sys
- from datetime import datetime
- import pandas as pd
- import time
- import json
- import re, os
def brower_start(port):
    """Open a Chrome session on the local Selenium server listening on *port*.

    Args:
        port: TCP port of the Selenium server on 127.0.0.1; it is
            interpolated into ``http://127.0.0.1:<port>/wd/hub``.

    Returns:
        The connected ``webdriver.Remote`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    options = webdriver.ChromeOptions()
    # Hand over the options object itself rather than
    # ``desired_capabilities=options.to_capabilities()``: that keyword is
    # deprecated in Selenium 4 and rejected by later 4.x releases, whereas
    # ``options=`` is accepted by both Selenium 3.141 and 4.x.
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        options=options,
    )
    return browser
# Matches the backslash-escaped id as it appears in the page source, i.e.
# the text  null,\"ChIJ...\"  -- the group captures only the id itself.
_GOOGLE_ID_PATTERN = re.compile(r'null,\\"(ChIJ[a-zA-Z0-9\-_+]*)\\"')


def _extract_google_id(sourcetext):
    """Return the first ``ChIJ...`` id found in *sourcetext*, or ``None``."""
    match = _GOOGLE_ID_PATTERN.search(sourcetext)
    return match.group(1) if match else None


def main():
    """Fill in missing ``google_id`` values for rows of ``shop_list2``.

    Steps:
      1. Restart the docker container named ``p<port>`` and connect a
         remote Chrome driver to the Selenium server on that port.
      2. Select up to 20 random rows whose ``google_id`` is null.
      3. For each row, load ``view-source:<item_url>``, extract the first
         ``ChIJ...`` id from the page source and upsert it keyed by
         ``unique_id``.

    The port defaults to 4444 and may be overridden by the first
    command-line argument. Rows whose page contains no id are reported
    and skipped instead of aborting the run.

    Raises:
        ValueError: if the command-line port argument is not an integer.
    """
    port = 4444
    if len(sys.argv) > 1:
        port = int(sys.argv[1])

    print('restart docker p{}'.format(port))
    os.system('sudo docker container restart p' + str(port))
    # Fixed pause, presumably to let the restarted container come up
    # before a session is requested.
    time.sleep(8)

    print('drvier start...')
    driver = brower_start(port)
    try:
        # NOTE(review): the credentials are hard-coded in this URL; consider
        # loading them from configuration or the environment instead.
        db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
        table = db['shop_list2']

        # Read every pending row up front so the result set is fully
        # consumed before rows are written back to the same table.
        rows = list(db.query('select * from shop_list2 where google_id is null ORDER BY RAND() limit 20'))

        for row in rows:
            unique_id = row['unique_id']
            driver.get('view-source:' + row['item_url'])
            # Short pause before reading the page source.
            time.sleep(0.5)

            google_id = _extract_google_id(driver.page_source)
            if google_id is None:
                # No id on this page: skip the row rather than crash the
                # whole batch with an IndexError.
                print('google_id not found for unique_id {}'.format(unique_id))
                continue

            print(google_id)
            table.upsert({'unique_id': unique_id, 'google_id': google_id}, ['unique_id'])
    finally:
        # Always end the remote browser session, even when a page load or
        # a database call fails part-way through.
        driver.quit()
-
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|