# -*- coding: utf-8 -*-
# run3.py: Google Maps POI crawler. Opens place pages with Selenium,
# parses them with BeautifulSoup, and writes the results to MySQL.
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
from logging.handlers import SysLogHandler
import traceback
import dataset
import pandas as pd
import time
import json
import re
import sys
import os
import logging
import socket

# Ship log records to the central syslog server over UDP.
_LOG_SERVER = ('hhh.ptt.cx', 514)
logger = logging.getLogger('poibot')
handler1 = SysLogHandler(address=_LOG_SERVER, socktype=socket.SOCK_DGRAM)
logger.addHandler(handler1)
logger.setLevel(logging.DEBUG)  # ensure DEBUG records are not filtered out
hname = socket.gethostname()
logger.debug('[poibot][' + hname + ']begin')

# import pyautogui as pag
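
# Note: this script mixes the Selenium 3 helper API (find_element_by_id,
# find_element_by_css_selector, ...) with the Selenium 4 style
# driver.find_element(By.ID, ...). The *_by_* helpers were deprecated in
# Selenium 4 and removed in later 4.x releases, so the file as written
# assumes a Selenium 3.x install.
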
def serive_create(profilepath):
    # Launch Chrome against a local Windows user profile.
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument("--user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('./utility/chromedriver_win32/chromedriver', options=option)
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver

def brower_start(port):
    # Start a headless local Chrome. The commented-out block below is the
    # earlier remote-webdriver setup that targeted a Selenium hub on `port`.
    logger.debug('[poibot][' + hname + '] browser start')
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument("--no-sandbox")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    browser = webdriver.Chrome(options=options)
    browser.set_window_size(1400, 1000)
    # browser = webdriver.Remote(
    #     command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
    #     # command_executor='http://192.53.174.202:' + str(port) + '/wd/hub',
    #     desired_capabilities=options.to_capabilities()
    # )
    return browser

def keyin_keyword(driver, keyword):
    # Type the keyword into the Maps search box and submit with Enter.
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
    # element = driver.find_element_by_class_name("V0h1Ob-haAclf")
    # driver.implicitly_wait(30)
    # ActionChains(driver).move_to_element(element).click(element).perform()

def open_time(driver):
    # Click the opening-hours row, unless it is a booking ('預訂') entry.
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
    if element.text.find('預訂') == -1:
        element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
        driver.implicitly_wait(10)
        ActionChains(driver).move_to_element(element).click(element).perform()
        return 1
    else:
        return 0

def get_shop_info(driver, output, shop_soup):
    # current_url_split = driver.current_url.split('@')[1].split(',')
    # output['lon'] = current_url_split[1]
    # output['lat'] = current_url_split[0]
    # The 'oloc' button's aria-label ends with "<area> <city>".
    location = shop_soup.find('button', {'data-item-id': 'oloc'})['aria-label'].split(' ')
    output['city'] = location[-1]
    output['area'] = location[-2]
    try:
        # Strip the '地址:' ("address:") label prefix from the aria-label.
        output['addr'] = shop_soup.find('button', {'data-item-id': 'address'})['aria-label'].replace('地址:', '')
    except:
        output['addr'] = ''
    try:
        # '複製電話號碼' = "copy phone number".
        output['tel'] = blank_check(shop_soup.find('button', {'data-tooltip': '複製電話號碼'})['aria-label'].split(':')[1])
    except:
        output['tel'] = ''
    print(output['addr'], ', ', output['tel'])
    for key in element_list:
        try:
            element = element_list[key]
            if len(element) == 3:
                # (tag, attrs, attribute): read an attribute value.
                value = shop_soup.find(element[0], element[1])[element[2]]
            else:
                # (tag, attrs): read the element text.
                tmp_value = shop_soup.find(element[0], element[1])
                if tmp_value:
                    value = tmp_value.text
                else:
                    value = ''
            output[key] = value_check(key, value)
        except:
            output[key] = ''
    return output

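# element_list is imported from utility.parseutils (not shown here). From the
# way get_shop_info consumes it, it is assumed to map output keys to
# BeautifulSoup lookups; an illustrative sketch with placeholder class names:
#   element_list = {
#       'rating': ('span', {'class': 'rating-class'}),                    # text lookup
#       'user_ratings_total': ('button', {'class': 'c'}, 'aria-label'),   # attribute lookup
#   }
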
def get_intro_info(driver, output):
    # element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
    try:
        # '簡介' = "About"; open the About panel for this place.
        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}簡介']".format(output['name']))
        driver.implicitly_wait(5)
        ActionChains(driver).move_to_element(element).click(element).perform()
        # pageSource = driver.page_source
        # fileToWrite = open("page_source.html", "w")
        # fileToWrite.write(pageSource)
        # fileToWrite.close()
        page_down_(driver, '//*[@id="pane"]/div/div[1]', 3)
        intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
        for key in intro_list:
            elements = intro_soup.find('div', {'aria-label': key})
            if elements:
                element = elements.find_all('li', {'class': 'LQjNnc-p83tee-JNdkSc-ibnC6b'})
                count = 0
                tmp = []
                for ele in element:
                    # Keep only items marked with the "check" icon.
                    # if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
                    if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"}):
                        tmp += [{
                            'id': count,
                            intro_list[key][1]: blank_check(ele.text)
                        }]
                        count += 1
                print(str(tmp))
                output[intro_list[key][0]] = str(tmp)
            else:
                output[intro_list[key][0]] = '[]'
        driver.back()
        return output
    except:
        # On any failure, emit an empty list for every intro field.
        for key in intro_list:
            output[intro_list[key][0]] = '[]'
        return output

def get_time_list(shop_soup, output):
    periods = []
    weekday_text = []
    # '永久停業' = permanently closed, '暫時關閉' = temporarily closed.
    open_now = blank_check(shop_soup.find('span', {'class': 'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
    if open_now == '永久停業' or open_now == '暫時關閉':
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'
    for tr_ in shop_soup.find_all('tr'):
        if tr_.find('div').text.replace(' ', '') != '':
            week = tr_.find('div').text
            time_list = [blank_check(i.text) for i in tr_.find_all('li')]
            for time_ in time_list:
                if time_ == '24 小時營業':  # "open 24 hours"
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": '0000'  # midnight, kept as a string like every other time value
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                elif time_ == '休息':  # "closed" this day
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": ''
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                else:
                    start, end = time_.split('–')
                    end_hour, end_min = end.split(':')
                    start_hour, start_min = start.split(':')
                    # Compare hours numerically; a closing hour earlier than
                    # the opening hour means the interval crosses midnight.
                    if int(end_hour) < int(start_hour):
                        end_day = week_list[week] + 1
                    else:
                        end_day = week_list[week]
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": start.replace(':', '')
                        },
                        "close": {
                            "day": end_day,
                            "time": end.replace(':', '')
                        }
                    }]
            weekday_text += ["{}: {}".format(week, ', '.join(time_list))]
    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    return output

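# For reference, a regular entry appended to `periods` above looks like
# (values illustrative):
#   {'open': {'day': 1, 'time': '0900'}, 'close': {'day': 1, 'time': '2130'}}
# mirroring the opening_hours.periods structure returned by the Google
# Places API, which this crawler appears to emulate.
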
def get_reviews(driver, output):
    # Open the "more reviews" pane and scrape every visible review.
    wait = WebDriverWait(driver, 30)
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
    )
    element = driver.find_element_by_css_selector(more_reviews_css)
    driver.implicitly_wait(10)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(0.5)
    # page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]', 5)
    page_down_(driver, '//div[@class="PPCwl"]', 5)
    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    # find_all()/select() return lists, so test their length against 0,
    # not the list itself.
    if len(comment_soup.find_all('div', class_='ODSEW-ShBeI-xJzy8c-bF1uUb')) != 0:
        all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
        for ap in all_photo:
            ap.click()
    # '顯示更多' = "show more"; expand truncated review texts.
    if len(comment_soup.select('button[aria-label="顯示更多"]')) != 0:
        all_review = driver.find_elements_by_css_selector('button[aria-label="顯示更多"]')
        for ap in all_review:
            ap.click()
    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    count = 0
    reviews = []
    for comment in comment_soup.find_all('div', {'class': 'ODSEW-ShBeI'}):
        comment_a_tag = comment.find_all('a')
        author_name = blank_check(comment_a_tag[1].find('div', class_='ODSEW-ShBeI-title').text)
        profile_photo_url = comment_a_tag[0].find('img')['src']
        # '顆星' = "stars"; strip it to leave the numeric rating.
        rating = blank_check(comment.find('span', {'role': 'img'})['aria-label'].replace('顆星', ''))
        text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
        created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text
        photos = []
        c = 0
        for i in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c'):
            # Extract the photo URL from the inline style's url(...) value.
            path = i['style'].split(';')[0].split('url')[1].replace('\"', '').replace('(', '').replace(')', '')
            photos += [path]
            c += 1
        reviews += [{
            'id': comment.find('a')['href'].split('/')[5],
            'author_name': author_name,
            'profile_photo_url': profile_photo_url,
            'rating': int(rating),
            'text': text,
            'created_at': created_at,
            'photos': photos
        }]
        count += 1
    output['reviews'] = str(reviews)
    driver.back()
    return output

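# A single element of `reviews` as built above (all values illustrative):
#   {'id': '10446...', 'author_name': '...', 'profile_photo_url': 'https://...',
#    'rating': 5, 'text': '...', 'created_at': '2 週前', 'photos': ['https://...']}
# created_at is the relative date text shown on the page, not a timestamp.
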
# def get_photo(output, shop_soup):
#     shop_photo = {}
#     for i in shop_soup.find('div',{'aria-label':'{}的相片'.format(output['name'])}).find_all('button'):
#         try:
#             if i['aria-label'] == '街景服務和 360 度相片' or i['aria-label'] == '影片':
#                 continue
#             shop_photo[i['aria-label']] = i.find('img')['src']
#         except:
#             pass
#     output['shop_photo'] = shop_photo
#     return output

def find_photo_list(driver):
    # Scroll the photo grid and collect up to six image URLs from the
    # tiles' inline-style background declarations.
    time.sleep(0.5)
    wait = WebDriverWait(driver, 60)
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
    )
    page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a', 10)
    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
    photo_url = []
    count = 0
    for i in photo_soup.find_all('a', class_='mWq4Rd-eEDwDf'):
        if count > 5:
            break
        a_url = i.find('div', class_='mWq4Rd-HiaYvf-CNusmb-gevUs loaded')
        if a_url:
            # Only treat tiles whose inline style carries a width
            # declaration as loaded photos.
            if a_url.get('style', '').find('width') != -1:
                sentence = a_url['style']
                photo = re.search(r'https:(.*)\"', sentence)
                photo_url += [photo.group(0).replace('\"', '')]
                count += 1
    return photo_url

def find_big_photo(output, driver):
    # Open the photo gallery, then harvest the "all" and "menu" photo tabs.
    # element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
    wait = WebDriverWait(driver, 60)
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button'))
    )
    element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button')
    ActionChains(driver).move_to_element(element).click(element).perform()
    output['shop_photo'] = '[]'
    output['menu_photo'] = '[]'
    # '全部' = all photos, '菜單' = menu photos.
    photo_map = {
        '全部': 'shop_photo',
        '菜單': 'menu_photo'
    }
    driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='1']")
    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
    tab_dict = {}
    for tab_index in [0, 1, 2]:
        selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
        if len(selector) != 0:
            photo_name = selector[0].text
            if photo_name == '菜單':
                tab_dict[photo_name] = tab_index
            elif photo_name == '全部':
                tab_dict[photo_name] = tab_index
    print(tab_dict)
    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        photo_list = find_photo_list(driver)
        output[photo_map[tab_]] = str(photo_list)
    return output

def get_url_list(driver):
    # wait = WebDriverWait(driver, 10)
    # wait.until(
    #     EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
    # )
    # driver.back()
    time.sleep(2)
    for i in range(5, 43, 2):
        driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except:
            pass
    return url_list

def data_select_insert(db, table_name, table_col, data):
    # Build a row in table_col order, append the crawl timestamp, and insert
    # with INSERT IGNORE so duplicate keys are skipped silently.
    tmp = []
    for name_ in table_col:
        if name_ == 'crawler_date':
            continue
        if name_ == 'lon' or name_ == 'lat':
            tmp += [float(data[name_])]
        else:
            tmp += [data[name_]]
    tmp += [datetime.today().strftime("%Y/%m/%d %H:%M")]
    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
        .format(table_name, str(tuple(table_col)).replace('\'', ''), tuple(tmp))
    DA.mysql_insert_data(db, insert_sql)

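# Illustrative example of the statement this produces (column names are
# hypothetical; the real ones come from SHOP_LIST_TABLE_COL in utility.connect):
#   INSERT IGNORE INTO shop_list2 (name, addr, tel, crawler_date)
#   VALUES ('某餐廳', '台中市...', '04-1234-5678', '2021/12/10 12:00')
# Values are interpolated via str(tuple(...)), so text containing a quote
# character can still break the statement; parameterized queries would be
# the safer pattern here.
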
def time_click(driver):
    # Expand the weekly opening-hours table if present, and report the shop
    # status: '正常' (normal), a closure notice, 'error', or '' on failure.
    shop_soup_tmp = BeautifulSoup(driver.page_source, 'html.parser')
    status = ''
    try:
        # '顯示本週營業時間' = "show this week's opening hours".
        if len(shop_soup_tmp.select("span[aria-label='顯示本週營業時間']")) != 0:
            time_css = "span[aria-label='顯示本週營業時間']"
            element = driver.find_element_by_css_selector(time_css)
            driver.implicitly_wait(10)
            ActionChains(driver).move_to_element(element).click(element).perform()
            status = '正常'
        elif len(shop_soup_tmp.select("img[aria-label='通知']")) != 0:
            # '通知' = notice banner; its text is typically '永久停業'
            # (permanently closed) or '暫時關閉' (temporarily closed).
            status = shop_soup_tmp.find('span', class_='LJKBpe-Tswv1b-text aSftqf').text
        elif len(shop_soup_tmp.select('button[aria-label*="查看更詳細的營業時間"]')) != 0:
            # '查看更詳細的營業時間' = "see more detailed opening hours".
            status = 'error'
        return status
    except:
        return ''

def get_new_keyword(db):
    # Pick the first keyword not yet recorded in progress_list2.
    result = db.query('select distinct(keyword) from shop_item_list order by keyword')
    result = pd.DataFrame([i for i in result])
    progress = db.query('select distinct(kw) from progress_list2')
    progress = pd.DataFrame([i for i in progress])
    if len(progress) != 0:
        keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0].values[0]
    else:
        keyword = result.iloc[0].values[0]
    return keyword

def get_not_cralwer_url(keyword):
    # Return the item URLs for `keyword` that appear in neither shop_list2
    # (already crawled) nor error_list2 (previously failed).
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list3']
    url_list = list(table.find(keyword=keyword))
    shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list2 where keyword="{}"'.format(keyword))]
    error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list2 where keyword="{}"'.format(keyword))]
    url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
    # url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
    # url_pd = url_pd[(url_pd['item_url_length']!=1000) & (url_pd['item_url_length']!=600)]
    url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
    url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
    print('have {} URL list'.format(len(url_pd)))
    # url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
    return url_pd

def serive_create_linux(profilepath):
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    option.add_argument('--no-sandbox')
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument('--incognito')
    option.add_argument(
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
    # option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument(
        "--user-data-dir=/home/noodlesloves/.config/google-chrome/")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('utility/chromedriver', options=option)
    # driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', chrome_options=option,
    #                           service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    return driver

def find_lon_lat(driver):
    # Right-click the centre of the visible map canvas (offset past the left
    # results pane) and read "lat,lon" from the first context-menu entry.
    e = driver.find_element_by_css_selector("#scene > div.widget-scene > canvas")
    size = e.size
    total_height = size['height']
    total_width = size['width']
    size2 = driver.find_element_by_css_selector("#pane > div.Yr7JMd-pane").size
    left_width = size2['width']
    print(total_height, total_width, left_width)
    x = (total_width - left_width) / 2 + left_width
    y = total_height / 2
    e = driver.find_element_by_css_selector("#pane > div.Yr7JMd-pane")
    action = webdriver.common.action_chains.ActionChains(driver)
    action.move_to_element_with_offset(e, x, y)
    action.context_click()
    action.perform()
    time.sleep(0.5)
    element = driver.find_element_by_css_selector('#action-menu > ul > li:nth-child(1)')
    lat, lon = element.text.split(',')
    return float(lat), float(lon)

def get_unique_id(driver):
    # Open the share dialog ('分享' = share), poll the input field until the
    # short URL appears, and take its last path segment as the unique id.
    element = driver.find_element(By.CSS_SELECTOR, "button[data-value='分享']")
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(0.5)
    for i in range(5):
        ele = driver.find_element(By.CSS_SELECTOR, "input")
        short_url = ele.get_attribute('value')
        unique_id = short_url.split('/')[-1]
        if len(unique_id) != 0:
            break
        time.sleep(0.5)
    # '關閉' = close; dismiss the share dialog.
    element = driver.find_element(By.CSS_SELECTOR, "button[aria-label='關閉']")
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(element).click(element).perform()
    return unique_id

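# The share dialog's short link is typically of the form
# https://goo.gl/maps/<token>, so unique_id ends up being that trailing
# token; this is an assumption inferred from the split('/')[-1] handling.
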
def page_down_(driver, xpath_css, time_):
    # Click the target element to give it focus, then send PAGE_DOWN `time_`
    # times to scroll the pane. When two elements match the XPath, the second
    # one is the scrollable pane.
    elmts = driver.find_elements_by_xpath(xpath_css)
    # print(xpath_css)
    print(elmts)
    # time.sleep(9999)
    if len(elmts) > 1:
        elmt = elmts[1]
    else:
        elmt = elmts[0]
    actions = ActionChains(driver)
    actions.move_to_element(elmt).click().perform()
    for i in range(time_):
        try:
            actions = ActionChains(driver)
            actions.send_keys(Keys.PAGE_DOWN).perform()
        except:
            traceback.print_exc()
        time.sleep(0.5)

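# A minimal alternative sketch, assuming the pane can be scrolled directly
# with JavaScript instead of synthesized PAGE_DOWN key presses (the CSS
# selector argument is hypothetical):
#
# def page_down_js(driver, css_selector, times):
#     for _ in range(times):
#         driver.execute_script(
#             "document.querySelector(arguments[0]).scrollBy(0, 1000);",
#             css_selector)
#         time.sleep(0.5)
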
def main():
    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    db2 = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table2 = db2['swire_store_list']
    # keyword = '麻辣火鍋'
    # if len(sys.argv) > 1:
    #     keyword = sys.argv[1]
    # port = 4444
    # if len(sys.argv) > 2:
    #     port = int(sys.argv[2])
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        print('restart docker p{}'.format(port))
        os.system('sudo docker container restart p' + str(port))
        time.sleep(8)
    else:
        port = 2
    for i in range(10):
        # Grab up to 500 unchecked stores in random order.
        result = db2.query('select * from swire_store_list where check_ is null ORDER BY RAND() limit 500')
        url_pd = pd.DataFrame([dict(i) for i in result])
        # Rebuild a place-page URL from the stored feature id (fid).
        url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
        # keyword = get_new_keyword(db2)
        # table2.insert({'kw': keyword, 'num': 0})
        # url_pd = get_not_cralwer_url(keyword)
        # print('drvier start {}...'.format(keyword))
        driver = brower_start(port)
        time.sleep(4)
        # driver = serive_create('Profile 6')
        # profilepath = 'Profile 1'
        # driver = serive_create_linux(profilepath)
        for key, row in url_pd.iterrows():
            try:
                name = row['name']
                item_url = row['item_url']
                print(key, name, ': ', item_url)
                print('start...')
                driver.get(item_url)
                # page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
                page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu-haAclf']", 3)
                # lat, lon = find_lon_lat(driver)
                # unique_id = get_unique_id(driver)
                time_status = time_click(driver)
                time.sleep(0.5)
                shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
                output = {
                    # 'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text),
                    'name': name,
                    'fid': row['fid']
                }
                print(output['name'])
                print('get_shop_info')
                output = get_shop_info(driver, output, shop_soup)
                print('get_intro_info')
                if len(shop_soup.select("div[aria-label='{}簡介']".format(output['name']))) != 0:
                    output = get_intro_info(driver, output)
                else:
                    # Use a distinct loop variable to avoid shadowing the
                    # iterrows key.
                    for intro_key in intro_list:
                        output[intro_list[intro_key][0]] = '[]'
                print('get_time_list')
                if time_status == '正常':
                    output = get_time_list(shop_soup, output)
                else:
                    # Keep the same string type that get_time_list writes.
                    output['open_now'] = 'False'
                    output['periods'] = ''
                    output['weekday_text'] = ''
                print('user_ratings_total')
                if output['user_ratings_total'] == '':
                    output['reviews'] = ''
                else:
                    output = get_reviews(driver, output)
                print('find_big_photo')
                output = find_big_photo(output, driver)
                output_name = output['name'].replace('(', '').replace(')', '')
                query_name = '{}+{}'.format(output_name, output['addr'])
                query_name = query_name.replace(' ', '')
                output['item_url'] = item_url
                output['keyword'] = row['keyword']
                output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
                data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
                table2.upsert({'place_id': row['place_id'], 'check_': 1}, ['place_id'])
            except Exception as e:
                # Record the failed row so it is skipped on later runs.
                table3 = db2['error_list2']
                table3.insert({'num': row['name'], 'keyword': row['keyword'], 'item_url': row['item_url'], 'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
                print(e)
                # error_table_col = ['name', 'keyword', 'item_url', 'crawler_date']
                # db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
                # data_select_insert(db, 'error_list2', error_table_col, row)
            time.sleep(1)

if __name__ == '__main__':
    main()