| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195 | 
							- # -*- coding: utf-8 -*-
 
- from selenium import webdriver
 
- from selenium.webdriver.common.action_chains import ActionChains
 
- from selenium.webdriver.common.keys import Keys
 
- from selenium.webdriver.support import expected_conditions as EC
 
- from selenium.webdriver.support.wait import WebDriverWait
 
- from selenium.webdriver.common.by import By
 
- import selenium
 
- import traceback
 
- from bs4 import BeautifulSoup
 
- from utility import database_access as DA
 
- from utility.parseutils import *
 
- from utility.connect import *
 
- from datetime import datetime
 
- import pandas as pd
 
- import dataset
 
- import time
 
- import json
 
- import re
 
- import sys, os
 
def brower_start(port):
    """Connect to a remote Selenium Chrome instance.

    Expects a Selenium standalone/hub container listening at
    127.0.0.1:<port>/wd/hub (see the docker restart in main()).

    :param port: local port the Selenium remote endpoint is bound to
    :return: connected remote WebDriver
    """
    options = webdriver.ChromeOptions()
    executor_url = 'http://127.0.0.1:' + str(port) + '/wd/hub'
    browser = webdriver.Remote(
        command_executor=executor_url,
        desired_capabilities=options.to_capabilities(),
    )
    return browser
 
def page_down_(driver, xpath_css, time_):
    """Focus a scrollable element and press PAGE_DOWN repeatedly.

    Used to force Google Maps to lazy-load more search results.

    :param driver: selenium WebDriver
    :param xpath_css: XPath locating the scroll container element(s)
    :param time_: number of PAGE_DOWN presses (0.5 s pause between each)
    """
    elmts = driver.find_elements_by_xpath(xpath_css)
    print(elmts)
    if not elmts:
        # No match at all: the original indexed elmts[0] and crashed with
        # IndexError; treat "nothing to scroll" as a no-op instead.
        return
    # Prefer the second match when there is one (the first match is
    # apparently not the element that should receive the click).
    elmt = elmts[1] if len(elmts) > 1 else elmts[0]
    actions = ActionChains(driver)
    actions.move_to_element(elmt).click().perform()
    for _ in range(time_):
        try:
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
        except Exception:
            # Narrowed from a bare except; keep scrolling best-effort but
            # log the failure instead of hiding KeyboardInterrupt etc.
            traceback.print_exc()
        time.sleep(0.5)
 
def get_url_list(driver):
    """Scroll the Maps result pane and collect place links from the page.

    Scrolls via page_down_ so more results are lazy-loaded, then parses
    the page source and keeps every anchor whose href contains
    'maps/place'.

    :param driver: selenium WebDriver positioned on a Maps search result
    :return: list of [href, aria-label] pairs
    """
    page_down_(driver, '//div[@class="TFQHme"]', 8)

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for anchor in url_soup.find_all('a'):
        try:
            if anchor['href'].find('maps/place') != -1:
                url_list.append([anchor['href'], anchor['aria-label']])
        except KeyError:
            # Anchor without an href or aria-label attribute -- skip it.
            # (Narrowed from a bare except: attribute access on a bs4 Tag
            # raises KeyError when the attribute is absent.)
            pass
    return url_list
 
def keyin_keyword(driver, keyword):
    """Type *keyword* into the Maps search box and submit with RETURN.

    :param driver: selenium WebDriver on a Google Maps page
    :param keyword: search term to enter
    """
    search_box = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    chain = ActionChains(driver)
    chain.move_to_element(search_box)
    chain.send_keys(keyword)
    chain.send_keys(Keys.RETURN)
    chain.perform()
    time.sleep(3)
 
- # def get_crawler_list(db):
 
-     
 
- #     result = db.query('select keyword, count(*) from shop_item_list group by keyword')
 
- #     result = pd.DataFrame([i for i in result])
 
- #     result.columns = ['keyword', 'count']
 
- #     result = result[result['count'] < 100]
 
- #     keyword = result.sample(1).iloc[0]['keyword']
 
-     
 
- #     num=0
 
- #     cursor=db.query('select num from progress_list2 where kw = "'+keyword+'"')
 
- #     for c in cursor:
 
- #         num=c['num']
 
- #         break
 
- #     cursor=db.query('select * from lat_lon_loc where num >= '+str(num))
 
- #     #    cursor=db.query('select * from lat_lon_loc')
 
- #     lst=[]
 
- #     for c in cursor:
 
- #         lst.append({'num':c['num'],'loc':c['loc'],'lat':c['lat'],'lon':c['lon']})
 
-         
 
- #     return keyword, lst
 
-     
 
def get_crawler_list(db):
    """Pick the next keyword to crawl.

    Reads every distinct keyword from shop_item_list and every keyword
    already recorded in progress_list2, then returns the first keyword
    (in the table's alphabetical order) not yet processed.

    :param db: dataset-style DB handle exposing .query(sql) -> row dicts
    :return: the chosen keyword string
    """
    keywords = pd.DataFrame(
        [row for row in db.query('select distinct(keyword) from shop_item_list order by keyword')])
    done = pd.DataFrame(
        [row for row in db.query('select distinct(kw) from progress_list2')])

    if len(done) == 0:
        # Nothing recorded yet: start from the first keyword.
        return keywords.iloc[0].values[0]

    remaining = keywords[~keywords['keyword'].isin(done.kw.to_list())]
    return remaining.iloc[0].values[0]
 
def main():
    """Crawl Google Maps for shop listings around fixed Taipei coordinates.

    Repeatedly picks an unprocessed keyword (get_crawler_list), searches
    for it at four lon/lat anchor points, walks up to 5 result pages per
    point, and inserts every discovered place link into shop_item_list3.
    Progress is tracked in progress_list2 so restarts resume on a new
    keyword.
    """
    # Four corners of the search area as [lon, lat] pairs (central Taipei).
    lon_lat = [[121.567, 25.038], [121.567, 25.046], [121.543, 25.046], [121.543, 25.038]]

    port = 4447
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        print('restart docker p{}'.format(port))
        # Restart the paired selenium container so each run gets a fresh browser.
        os.system('sudo docker container restart pp' + str(port))
        time.sleep(8)

    print('drvier start...')
    driver = brower_start(port)

    # NOTE(review): DB credentials are hard-coded in source -- should be
    # moved to a config file or environment variable.
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list3']
    table2 = db['progress_list2']

    for _ in range(20):
        try:
            keyword = get_crawler_list(db)
            print(keyword)
            c = 0
            for row in lon_lat:
                c += 1
                latitude = row[1]   # 緯度 (latitude)
                longitude = row[0]  # 精度 (longitude)
                # Record progress for this keyword/anchor before crawling it.
                table2.insert({'kw': keyword, 'num': c})

                url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
                driver.get(url)
                keyin_keyword(driver, keyword)

                for page in range(5):
                    print(keyword, latitude, longitude, page)
                    url_list = get_url_list(driver)
                    duplicate = 0
                    for item in url_list:
                        try:
                            table.insert({'name': item[1], 'lon': longitude, 'lat': latitude,
                                          'keyword': keyword, 'item_url': item[0],
                                          'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
                        except Exception:
                            # Insert failure (presumably a unique-key
                            # collision, i.e. already crawled) counts as a
                            # duplicate. Narrowed from a bare except.
                            duplicate += 1
                    print(len(url_list), duplicate)

                    # Click "next page" only for the first two pages.
                    if page < 2:
                        element = driver.find_element_by_id('ppdPk-Ej1Yeb-LgbsSe-tJiF1e')
                        driver.implicitly_wait(30)
                        ActionChains(driver).move_to_element(element).click(element).perform()
        except Exception:
            # Best-effort loop: log the failure and move on to the next
            # keyword instead of silently swallowing it (was `except: pass`).
            traceback.print_exc()
 
# Script entry point: run the crawler when executed directly.
if __name__ == '__main__':
    main()
 
 
  |