jared 3 years ago
parent
commit
fe1ac05e68
1 changed file with 138 additions and 0 deletions
  1. 138 0
      jared_pureselenium_shop_item_list.py

+ 138 - 0
jared_pureselenium_shop_item_list.py

@@ -0,0 +1,138 @@
+# -*- coding: utf-8 -*-
+from selenium import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+
+from bs4 import BeautifulSoup
+
+from utility import database_access as DA
+from utility.parseutils import *
+from utility.connect import *
+import dataset
+import sys
+from datetime import datetime
+import pandas as pd
+import time
+import traceback
+import json
+import re
+
+def brower_start(port):
+    options = webdriver.ChromeOptions()
+
+    browser = webdriver.Chrome(options=options)
+#    上面成功再來用docker
+#    browser = webdriver.Remote(
+#        command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
+#        desired_capabilities=options.to_capabilities()
+#    )
+    return browser
+
+
+def get_url_list(driver):
+    wait = WebDriverWait(driver, 60)
+    wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ppdPk-Ej1Yeb-LgbsSe-tJiF1e"]')))
+
+
+#    elmts=driver.find_elements_by_xpath("//div[contains(@class,'siAUzd-neVct section-scrollbox') and not( contains(@role,'region') )]")
+
+    elmts=driver.find_elements_by_xpath("//div[@class='siAUzd-neVct section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc siAUzd-neVct-Q3DXx-BvBYQ']")
+
+    print(elmts)
+    if len(elmts)>1:
+        elmt=elmts[1]
+    else:
+        elmt=elmts[0]
+
+#    webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+
+    for i in range(8):
+        try:
+#            print(elmt)
+#            print('before send key')
+            elmt.send_keys(Keys.PAGE_DOWN)
+        except:
+#            print('exception')
+            traceback.print_exc()
+#        print('after send key')
+        time.sleep(0.5)
+
+    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
+    url_list = []
+    for i in url_soup.find_all('a'):
+        try:
+            if i['href'].find('maps/place') != -1:
+                url_list += [[i['href'], i['aria-label']]]
+        except:
+            pass
+    return url_list
+
+
+def keyin_keyword(driver, keyword):
+    button = driver.find_element_by_id("searchbox")
+    driver.implicitly_wait(30)
+    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
+    time.sleep(3)
+
+
+def main():
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
+
+
+    keyword = '虱目魚'
+    if len(sys.argv) >1:
+        keyword=sys.argv[1]
+    port=4444
+    if len(sys.argv) >2:
+        port=int(sys.argv[2])
+
+    print('drvier start...')
+    driver = brower_start(port)
+
+
+    num=0
+    cursor=db.query('select num from progress_list where kw = "'+keyword+'"')
+    for c in cursor:
+        num=c['num']
+        break
+
+    table2=db['progress_list']
+
+
+    cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
+#    cursor=db.query('select * from lat_lon_loc')
+    lst=[]
+    for c in cursor:
+        lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
+
+
+
+    for r in lst:
+
+        latitude = r['lat'] #緯度
+        longitude = r['lon'] #精度
+        table2.upsert({'kw':keyword,'num':r['num']},['kw'])
+        url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
+        driver.get(url)
+
+        keyin_keyword(driver, keyword)
+        
+        for page in range(4):
+            print( r['loc'], latitude, longitude, page)
+            url_list = get_url_list(driver)
+            print(url_list)
+            shop_item_list_col = ['name','lon','lat','keyword','item_url','crawler_date']
+            for item in url_list:
+                result = [item[1], longitude, latitude, keyword, item[0], datetime.today().strftime("%Y/%m/%d %H:%M")]
+                print(result)
+            
+            if page < 2 :
+                element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
+                driver.implicitly_wait(30)
+                ActionChains(driver).move_to_element(element).click(element).perform() 
+
+# Run the crawler only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()