# -*- coding: utf-8 -*-
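
# Google Maps POI crawler (run3.py): drives Chrome through Selenium to open
# Google Maps place pages, parses shop details, opening hours, reviews and
# photos with BeautifulSoup, and writes the results to MySQL. The selectors
# and aria-labels below match the Chinese-language (zh-TW) Maps UI, so pages
# are expected to render in Chinese. Helpers such as blank_check(),
# value_check(), element_list, intro_list and week_list come from the
# star-imported utility modules.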
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
from logging.handlers import SysLogHandler
import traceback
import dataset
import pandas as pd
import time
import json
import re
import sys
import os
import logging
import socket
# Send crawler progress to a remote syslog server over UDP.
_LOG_SERVER = ('hhh.ptt.cx', 514)
logger = logging.getLogger('poibot')
handler1 = SysLogHandler(address=_LOG_SERVER, socktype=socket.SOCK_DGRAM)
logger.addHandler(handler1)
hname = socket.gethostname()
pid = str(os.getpid())
logger.fatal('[poibot][' + hname + '][' + pid + ']begin')
# import pyautogui as pag
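
# Create a Chrome session bound to a Windows user profile. An existing
# driver can be passed in; it is quit (and stray chrome processes killed)
# before the new session starts.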
def serive_create(profilepath, driver=None):
    if driver is not None:
        driver.quit()
        os.system('killall chrome')
        driver = None
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument("--user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('./utility/chromedriver_win32/chromedriver', options=option)
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver
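
# Start a plain Chrome session for crawling. `port` is only used by the
# commented-out Remote/Grid variant below.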
def brower_start(port):
    logger.fatal('[poibot][' + hname + '][' + pid + ']browser start')
    options = webdriver.ChromeOptions()
    # browser = webdriver.Chrome(options=options)
    options.add_argument('--ignore-certificate-errors')
    options.add_argument("--no-sandbox")
    # options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    browser = webdriver.Chrome(options=options)
    browser.set_window_size(1400, 1000)
    # browser = webdriver.Remote(
    #     command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
    #     # command_executor='http://192.53.174.202:' + str(port) + '/wd/hub',
    #     desired_capabilities=options.to_capabilities()
    # )
    return browser
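
# Type a keyword into the Maps search box and press ENTER.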
def keyin_keyword(driver, keyword):
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
    # element = driver.find_element_by_class_name("V0h1Ob-haAclf")
    # driver.implicitly_wait(30)
    # ActionChains(driver).move_to_element(element).click(element).perform()
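
# Click the opening-hours block unless it is a booking ('預訂') widget;
# returns 1 if the panel was opened, 0 otherwise.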
def open_time(driver):
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
    if element.text.find('預訂') == -1:  # '預訂' = "reserve"
        element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
        driver.implicitly_wait(10)
        ActionChains(driver).move_to_element(element).click(element).perform()
        return 1
    else:
        return 0
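
# Parse city/area, address and phone number from the place page, plus every
# field configured in element_list.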
def get_shop_info(driver, output, shop_soup):
    # current_url_split = driver.current_url.split('@')[1].split(',')
    # output['lon'] = current_url_split[1]
    # output['lat'] = current_url_split[0]
    location = shop_soup.find('button', {'data-item-id': 'oloc'})['aria-label'].split(' ')
    output['city'] = location[-1]
    output['area'] = location[-2]
    try:
        # '地址:' = "Address:"
        output['addr'] = shop_soup.find('button', {'data-item-id': 'address'})['aria-label'].replace('地址:', '')
    except:
        output['addr'] = ''
    try:
        # '複製電話號碼' = "copy phone number"
        output['tel'] = blank_check(shop_soup.find('button', {'data-tooltip': '複製電話號碼'})['aria-label'].split(':')[1])
    except:
        output['tel'] = ''
    print(output['addr'], ', ', output['tel'])
    for key in element_list:
        try:
            element = element_list[key]
            if len(element) == 3:
                value = shop_soup.find(element[0], element[1])[element[2]]
            else:
                tmp_value = shop_soup.find(element[0], element[1])
                if tmp_value:
                    value = tmp_value.text
                else:
                    value = ''
            output[key] = value_check(key, value)
        except:
            output[key] = ''
    return output
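
# Open the place's "About" ('簡介') panel and collect the attribute items
# configured in intro_list; on any failure every field defaults to '[]'.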
def get_intro_info(driver, output):
    # element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
    try:
        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}簡介']".format(output['name']))
        driver.implicitly_wait(5)
        ActionChains(driver).move_to_element(element).click(element).perform()
        # pageSource = driver.page_source
        # fileToWrite = open("page_source.html", "w")
        # fileToWrite.write(pageSource)
        # fileToWrite.close()
        page_down_(driver, '//*[@id="pane"]/div/div[1]', 3)
        intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
        for key in intro_list:
            elements = intro_soup.find('div', {'aria-label': key})
            if elements:
                element = elements.find_all('li', {'class': 'LQjNnc-p83tee-JNdkSc-ibnC6b'})
                count = 0
                tmp = []
                for ele in element:
                    # Keep only items marked with a check icon.
                    # if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
                    if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"}):
                        tmp += [{
                            'id': count,
                            intro_list[key][1]: blank_check(ele.text)
                        }]
                        count += 1
                print(str(tmp))
                output[intro_list[key][0]] = str(tmp)
            else:
                output[intro_list[key][0]] = '[]'
        driver.back()
        return output
    except:
        for key in intro_list:
            output[intro_list[key][0]] = '[]'
        return output
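
# Convert the opening-hours table into Google-Places-style `periods` plus
# human-readable `weekday_text`; week_list maps weekday names to day numbers.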
def get_time_list(shop_soup, output):
    periods = []
    weekday_text = []
    open_now = blank_check(shop_soup.find('span', {'class': 'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
    if open_now == '永久停業' or open_now == '暫時關閉':  # "permanently closed" / "temporarily closed"
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'
    for tr_ in shop_soup.find_all('tr'):
        if tr_.find('div').text.replace(' ', '') != '':
            week = tr_.find('div').text
            time_list = [blank_check(i.text) for i in tr_.find_all('li')]
            for time_ in time_list:
                if time_ == '24 小時營業':  # "open 24 hours"
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": '0000'  # string, to match the other branches
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                elif time_ == '休息':  # "closed"
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": ''
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                else:
                    start, end = time_.split('–')
                    end_hour, end_min = end.split(':')
                    start_hour, start_min = start.split(':')
                    # Zero-padded hour strings compare correctly; a close hour
                    # earlier than the open hour means closing after midnight.
                    if end_hour < start_hour:
                        end_day = week_list[week] + 1
                    else:
                        end_day = week_list[week]
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": start.replace(':', '')
                        },
                        "close": {
                            "day": end_day,
                            "time": end.replace(':', '')
                        }
                    }]
            weekday_text += ["{}: {}".format(week, ', '.join(time_list))]
    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    return output
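
# Open the reviews pane, expand photo thumbnails and truncated texts, then
# scrape author, rating, text, date and photo URLs for each review.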
def get_reviews(driver, output):
    wait = WebDriverWait(driver, 30)
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
    )
    element = driver.find_element_by_css_selector(more_reviews_css)
    driver.implicitly_wait(10)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(0.5)
    # page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]', 5)
    page_down_(driver, '//div[@class="PPCwl"]', 5)
    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    if len(comment_soup.find_all('div', class_='ODSEW-ShBeI-xJzy8c-bF1uUb')) != 0:
        all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
        for ap in all_photo:
            ap.click()
    if len(comment_soup.select('button[aria-label="顯示更多"]')) != 0:  # "show more"
        all_review = driver.find_elements_by_css_selector('button[aria-label="顯示更多"]')
        for ap in all_review:
            ap.click()
    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    count = 0
    reviews = []
    for comment in comment_soup.find_all('div', {'class': 'ODSEW-ShBeI'}):
        comment_a_tag = comment.find_all('a')
        author_name = blank_check(comment_a_tag[1].find('div', class_='ODSEW-ShBeI-title').text)
        profile_photo_url = comment_a_tag[0].find('img')['src']
        rating = blank_check(comment.find('span', {'role': 'img'})['aria-label'].replace('顆星', ''))  # '顆星' = "stars"
        text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
        created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text
        photos = []
        c = 0
        for i in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c'):
            # Pull the background-image URL out of the inline style.
            path = i['style'].split(';')[0].split('url')[1].replace('"', '').replace('(', '').replace(')', '')
            photos += [path]
            c += 1
        reviews += [{
            'id': comment.find('a')['href'].split('/')[5],
            'author_name': author_name,
            'profile_photo_url': profile_photo_url,
            'rating': int(rating),
            'text': text,
            'created_at': created_at,
            'photos': photos
        }]
        count += 1
    output['reviews'] = str(reviews)
    driver.back()
    return output
# def get_photo(output, shop_soup):
#     shop_photo = {}
#     for i in shop_soup.find('div', {'aria-label': '{}的相片'.format(output['name'])}).find_all('button'):
#         try:
#             if i['aria-label'] == '街景服務和 360 度相片' or i['aria-label'] == '影片':
#                 continue
#             shop_photo[i['aria-label']] = i.find('img')['src']
#         except:
#             pass
#     output['shop_photo'] = shop_photo
#     return output
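
# Scroll the photo gallery and collect up to six photo URLs from the inline
# background-image styles.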
def find_photo_list(driver):
    time.sleep(0.5)
    wait = WebDriverWait(driver, 60)
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
    )
    page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a', 10)
    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
    photo_url = []
    count = 0
    for i in photo_soup.find_all('a', class_='mWq4Rd-eEDwDf'):
        if count > 5:
            break
        a_url = i.find('div', class_='mWq4Rd-HiaYvf-CNusmb-gevUs loaded')
        if a_url:
            # Only tiles whose inline style sets a width carry a photo URL.
            if a_url['style'].find('width') != -1:
                sentence = a_url['style']
                photo = re.search(r'https:(.*)"', sentence)
                photo_url += [photo.group(0).replace('"', '')]
            count += 1
    return photo_url
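
# Open the photo viewer, find the '全部' (all) and '菜單' (menu) tabs by
# data-tab-index, and store each tab's photo list in `output`.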
def find_big_photo(output, driver):
    # element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
    wait = WebDriverWait(driver, 60)
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button'))
    )
    element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button')
    ActionChains(driver).move_to_element(element).click(element).perform()
    output['shop_photo'] = '[]'
    output['menu_photo'] = '[]'
    photo_map = {
        '全部': 'shop_photo',  # "all"
        '菜單': 'menu_photo'   # "menu"
    }
    driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='1']")  # presence check; result unused
    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
    tab_dict = {}
    for tab_index in [0, 1, 2]:
        selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
        if len(selector) != 0:
            photo_name = selector[0].text
            if photo_name in ('菜單', '全部'):
                tab_dict[photo_name] = tab_index
    print(tab_dict)
    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        photo_list = find_photo_list(driver)
        output[photo_map[tab_]] = str(photo_list)
    return output
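
# Scroll the search-result list and return [href, aria-label] pairs for
# every link that points at a maps/place URL.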
def get_url_list(driver):
    # wait = WebDriverWait(driver, 10)
    # wait.until(
    #     EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
    # )
    # driver.back()
    time.sleep(2)
    # Send DOWN to every other result entry to force the list to scroll.
    for i in range(5, 43, 2):
        driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except:
            pass
    return url_list
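
# Build an INSERT IGNORE statement for table_col (appending the crawl
# timestamp as crawler_date) and execute it via the database_access helper.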
def data_select_insert(db, table_name, table_col, data):
    tmp = []
    for name_ in table_col:
        if name_ == 'crawler_date':
            continue
        if name_ == 'lon' or name_ == 'lat':
            tmp += [float(data[name_])]
        else:
            tmp += [data[name_]]
    tmp += [datetime.today().strftime("%Y/%m/%d %H:%M")]
    # Column names and values are rendered through tuple reprs; INSERT IGNORE
    # skips rows that collide with an existing unique key.
    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
        .format(table_name, str(tuple(table_col)).replace('\'', ''), tuple(tmp))
    DA.mysql_insert_data(db, insert_sql)
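
# Inspect the page and click the weekly-hours toggle when present. Returns
# '正常' ("normal"), a closure notice, 'error', or '' on exception.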
def time_click(driver):
    shop_soup_tmp = BeautifulSoup(driver.page_source, 'html.parser')
    status = ''
    try:
        if len(shop_soup_tmp.select("span[aria-label='顯示本週營業時間']")) != 0:  # "show this week's hours"
            time_css = "span[aria-label='顯示本週營業時間']"
            element = driver.find_element_by_css_selector(time_css)
            driver.implicitly_wait(10)
            ActionChains(driver).move_to_element(element).click(element).perform()
            status = '正常'  # "normal"
        elif len(shop_soup_tmp.select("img[aria-label='通知']")) != 0:  # "notice"
            status = shop_soup_tmp.find('span', class_='LJKBpe-Tswv1b-text aSftqf').text
            # status = '永久停業' or '暫時關閉' ("permanently closed" / "temporarily closed")
        elif len(shop_soup_tmp.select('button[aria-label*="查看更詳細的營業時間"]')) != 0:  # "see detailed hours"
            status = 'error'
        return status
    except:
        return ''
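
# Pick the next keyword from shop_item_list that has no entry yet in
# progress_list2.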
def get_new_keyword(db):
    result = db.query('select distinct(keyword) from shop_item_list order by keyword')
    result = pd.DataFrame([i for i in result])
    progress = db.query('select distinct(kw) from progress_list2')
    progress = pd.DataFrame([i for i in progress])
    if len(progress) != 0:
        keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0].values[0]
    else:
        keyword = result.iloc[0].values[0]
    return keyword
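
# Return the item URLs for `keyword` found in shop_item_list3 but not yet in
# shop_list2 (done) or error_list2 (failed).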
def get_not_cralwer_url(keyword):
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list3']
    url_list = list(table.find(keyword=keyword))
    shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list2 where keyword="{}"'.format(keyword))]
    error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list2 where keyword="{}"'.format(keyword))]
    url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
    # url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
    # url_pd = url_pd[(url_pd['item_url_length'] != 1000) & (url_pd['item_url_length'] != 600)]
    url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
    url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
    print('have {} URL list'.format(len(url_pd)))
    # url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
    return url_pd
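
# Linux variant of serive_create: headless Chrome bound to a local
# google-chrome profile directory.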
def serive_create_linux(profilepath):
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    option.add_argument('--no-sandbox')
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument('--incognito')
    option.add_argument(
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
    # option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument(
        "--user-data-dir=/home/noodlesloves/.config/google-chrome/")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('utility/chromedriver', options=option)
    # driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', chrome_options=option,
    #                           service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    return driver
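
# Right-click the centre of the visible map canvas and read the coordinates
# from the first context-menu entry.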
def find_lon_lat(driver):
    e = driver.find_element_by_css_selector("#scene > div.widget-scene > canvas")
    size = e.size
    total_height = size['height']
    total_width = size['width']
    size2 = driver.find_element_by_css_selector("#pane > div.Yr7JMd-pane").size
    left_width = size2['width']
    print(total_height, total_width, left_width)
    x = (total_width - left_width) / 2 + left_width
    y = total_height / 2
    e = driver.find_element_by_css_selector("#pane > div.Yr7JMd-pane")
    action = webdriver.common.action_chains.ActionChains(driver)
    action.move_to_element_with_offset(e, x, y)
    action.context_click()
    action.perform()
    time.sleep(0.5)
    element = driver.find_element_by_css_selector('#action-menu > ul > li:nth-child(1)')
    lat, lon = element.text.split(',')
    return float(lat), float(lon)
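
# Open the share ('分享') dialog, poll the short-URL input, and return its
# trailing path segment as the place's unique id.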
def get_unique_id(driver):
    element = driver.find_element(By.CSS_SELECTOR, "button[data-value='分享']")  # "share"
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(0.5)
    for i in range(5):
        ele = driver.find_element(By.CSS_SELECTOR, "input")
        short_url = ele.get_attribute('value')
        unique_id = short_url.split('/')[-1]
        if len(unique_id) != 0:
            break
        time.sleep(0.5)
    element = driver.find_element(By.CSS_SELECTOR, "button[aria-label='關閉']")  # "close"
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(element).click(element).perform()
    return unique_id
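
# Click the element at xpath_css (preferring the second match) and send
# PAGE_DOWN `time_` times so lazily loaded content renders.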
def page_down_(driver, xpath_css, time_):
    elmts = driver.find_elements_by_xpath(xpath_css)
    # print(xpath_css)
    print(elmts)
    # time.sleep(9999)
    if len(elmts) > 1:
        elmt = elmts[1]
    else:
        elmt = elmts[0]
    actions = ActionChains(driver)
    actions.move_to_element(elmt).click().perform()
    for i in range(time_):
        try:
            actions = ActionChains(driver)
            actions.send_keys(Keys.PAGE_DOWN).perform()
        except:
            traceback.print_exc()
        time.sleep(0.5)
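
# Main loop: pull random unprocessed rows from swire_store_list, scrape each
# place page, insert results into SHOP_LIST_TABLE, and record failures in
# error_list2.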
def main():
    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    db2 = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table2 = db2['swire_store_list']
    # keyword = '麻辣火鍋'
    # if len(sys.argv) > 1:
    #     keyword = sys.argv[1]
    # port = 4444
    # if len(sys.argv) > 2:
    #     port = int(sys.argv[2])
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        # print('restart docker p{}'.format(port))
        # os.system('sudo docker container restart p' + str(port))
        # time.sleep(8)
    else:
        port = 2

    for i in range(10):
        # result = db2.query('select * from swire_store_list where check_ is null and fid not in (select distinct fid from error_list2) ORDER BY RAND() limit 500')
        result = db2.query('SELECT * FROM swire_store_list a WHERE not exists (select 1 from error_list2 tei where tei.fid = a.fid limit 1) ORDER BY RAND() limit 500')
        url_pd = pd.DataFrame([dict(i) for i in result])
        # print(url_pd)
        url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
        # keyword = get_new_keyword(db2)
        # table2.insert({'kw': keyword, 'num': 0})
        # url_pd = get_not_cralwer_url(keyword)
        # print('drvier start {}...'.format(keyword))
        driver = brower_start(port)
        time.sleep(4)
        # driver = serive_create('Profile 6')
        # profilepath = 'Profile 1'
        # driver = serive_create_linux(profilepath)
        for key, row in url_pd.iterrows():
            try:
                name = row['name']
                logger.fatal('[poibot][' + hname + '][' + pid + '] processing: ' + name)
                item_url = row['item_url']
                print(key, name, ': ', item_url)
                print('start...')
                driver.get(item_url)
                # time.sleep(9999)  # debug pause
                # page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
                page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu-haAclf']", 3)
                # lat, lon = find_lon_lat(driver)
                # unique_id = get_unique_id(driver)
                time_status = time_click(driver)
                time.sleep(0.5)
                shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
                output = {
                    # 'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text),
                    'name': name,
                    'fid': row['fid']
                }
                print(output['name'])
                print('get_shop_info')
                output = get_shop_info(driver, output, shop_soup)
                print('get_intro_info')
                if len(shop_soup.select("div[aria-label='{}簡介']".format(output['name']))) != 0:
                    output = get_intro_info(driver, output)
                else:
                    for intro_key in intro_list:  # separate name, so the row-loop `key` is not clobbered
                        output[intro_list[intro_key][0]] = '[]'
                print('get_time_list')
                if time_status == '正常':
                    output = get_time_list(shop_soup, output)
                else:
                    output['open_now'] = False
                    output['periods'] = ''
                    output['weekday_text'] = ''
                print('user_ratings_total')
                if output['user_ratings_total'] == '':
                    output['reviews'] = ''
                else:
                    output = get_reviews(driver, output)
                print('find_big_photo')
                output = find_big_photo(output, driver)
                output_name = output['name'].replace('(', '').replace(')', '')
                query_name = '{}+{}'.format(output_name, output['addr'])
                query_name = query_name.replace(' ', '')
                output['item_url'] = item_url
                output['keyword'] = row['keyword']
                output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
                data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
                table2.upsert({'place_id': row['place_id'], 'check_': 1}, ['place_id'])
            except Exception as e:
                traceback.print_exc()
                table3 = db2['error_list2']
                table3.insert({'fid': row['fid'], 'num': row['name'], 'keyword': row['keyword'], 'item_url': row['item_url'], 'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
                print(e)
                # error_table_col = ['name', 'keyword', 'item_url', 'crawler_date']
                # db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
                # data_select_insert(db, 'error_list2', error_table_col, row)
            time.sleep(1)
if __name__ == '__main__':
    main()