# run.py — Google Maps POI crawler (paste artifacts removed)
  1. # -*- coding: utf-8 -*-
  2. from selenium import webdriver
  3. from selenium.webdriver.common.action_chains import ActionChains
  4. from selenium.webdriver.common.keys import Keys
  5. from selenium.webdriver.support import expected_conditions as EC
  6. from selenium.webdriver.support.wait import WebDriverWait
  7. from selenium.common.exceptions import NoSuchElementException
  8. from selenium.webdriver.common.by import By
  9. from bs4 import BeautifulSoup
  10. from utility import database_access as DA
  11. from utility.parseutils import *
  12. from utility.connect import *
  13. from datetime import datetime
  14. import traceback
  15. import dataset
  16. import pandas as pd
  17. import time
  18. import json
  19. import re
  20. import sys
  21. import os
  22. # import pyautogui as pag
  23. def serive_create(profilepath):
  24. option = webdriver.ChromeOptions()
  25. option.add_argument('--disable-web-security')
  26. option.add_argument('--allow-running-insecure-content')
  27. option.add_argument("--user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data")
  28. option.add_argument("profile-directory="+profilepath)
  29. driver = webdriver.Chrome('./utility/chromedriver_win32/chromedriver', options=option)
  30. executor_url = driver.command_executor._url
  31. session_id = driver.session_id
  32. print (session_id)
  33. print (executor_url)
  34. time.sleep(3)
  35. return driver
  36. def brower_start(port):
  37. options = webdriver.ChromeOptions()
  38. # browser = webdriver.Chrome(options=options)
  39. browser = webdriver.Remote(
  40. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  41. # command_executor='http://192.53.174.202:'+str(port)+'/wd/hub',
  42. desired_capabilities=options.to_capabilities()
  43. )
  44. return browser
  45. def keyin_keyword(driver, keyword):
  46. button = driver.find_element_by_id("searchbox")
  47. driver.implicitly_wait(30)
  48. ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
  49. time.sleep(3)
  50. # element = driver.find_element_by_class_name("V0h1Ob-haAclf")
  51. # driver.implicitly_wait(30)
  52. # ActionChains(driver).move_to_element(element).click(element).perform()
  53. def open_time(driver):
  54. element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
  55. if element.text.find('預訂') == -1:
  56. element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
  57. driver.implicitly_wait(10)
  58. ActionChains(driver).move_to_element(element).click(element).perform()
  59. return 1
  60. else:
  61. return 0
  62. def get_shop_info(driver, output, shop_soup):
  63. # current_url_split = driver.current_url.split('@')[1].split(',')
  64. # output['lon'] = current_url_split[1]
  65. # output['lat'] = current_url_split[0]
  66. location = shop_soup.find('button',{'data-item-id':'oloc'})['aria-label'].split(' ')
  67. output['city'] = location[-1]
  68. output['area'] = location[-2]
  69. try:
  70. output['addr'] = shop_soup.find('button',{'data-item-id':'address'})['aria-label'].replace('地址:', '')
  71. except:
  72. output['addr'] = ''
  73. try:
  74. output['tel'] = blank_check(shop_soup.find('button',{'data-tooltip':'複製電話號碼'})['aria-label'].split(':')[1])
  75. except:
  76. output['tel'] = ''
  77. print(output['addr'], ', ' ,output['tel'])
  78. for key in element_list:
  79. try:
  80. element = element_list[key]
  81. if len(element) == 3:
  82. value = shop_soup.find(element[0],element[1])[element[2]]
  83. else:
  84. tmp_value = shop_soup.find(element[0],element[1])
  85. if tmp_value:
  86. value = tmp_value.text
  87. else:
  88. value = ''
  89. output[key] = value_check(key, value)
  90. except:
  91. output[key] = ''
  92. return output
  93. def get_intro_info(driver, output):
  94. # element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
  95. try:
  96. element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}簡介']".format(output['name']))
  97. driver.implicitly_wait(5)
  98. ActionChains(driver).move_to_element(element).click(element).perform()
  99. # pageSource = driver.page_source
  100. # fileToWrite = open("page_source.html", "w")
  101. # fileToWrite.write(pageSource)
  102. # fileToWrite.close()
  103. page_down_(driver, '//*[@id="pane"]/div/div[1]', 3)
  104. intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
  105. for key in intro_list:
  106. elements = intro_soup.find('div',{'aria-label':key})
  107. if elements:
  108. element = elements.find_all('li',{'class':'LQjNnc-p83tee-JNdkSc-ibnC6b'})
  109. count = 0
  110. tmp = []
  111. for ele in element:
  112. # if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
  113. if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"}):
  114. tmp += [{
  115. 'id':count,
  116. intro_list[key][1]: blank_check(ele.text)
  117. }]
  118. count += 1
  119. print(str(tmp))
  120. output[intro_list[key][0]] = str(tmp)
  121. else:
  122. output[intro_list[key][0]] = '[]'
  123. driver.back()
  124. return output
  125. except:
  126. for key in intro_list:
  127. output[intro_list[key][0]] = '[]'
  128. return output
  129. def get_time_list(shop_soup, output):
  130. periods = []
  131. weekday_text = []
  132. open_now = blank_check(shop_soup.find('span', {'class':'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
  133. if open_now == '永久停業' or open_now == '暫時關閉':
  134. output['open_now'] = 'False'
  135. else:
  136. output['open_now'] = 'True'
  137. for tr_ in shop_soup.find_all('tr'):
  138. if tr_.find('div').text.replace(' ','') != '':
  139. week = tr_.find('div').text
  140. time_list = [blank_check(i.text) for i in tr_.find_all('li')]
  141. for time_ in time_list:
  142. if time_ == '24 小時營業':
  143. periods += [{
  144. "open":{
  145. "day": week_list[week],
  146. "time": 0000
  147. },
  148. "close":{
  149. "day": week_list[week],
  150. "time": ''
  151. }
  152. }]
  153. elif time_ == '休息':
  154. periods += [{
  155. "open":{
  156. "day": week_list[week],
  157. "time": ''
  158. },
  159. "close":{
  160. "day": week_list[week],
  161. "time": ''
  162. }
  163. }]
  164. else:
  165. start, end = time_.split('–')
  166. end_hour, end_min = end.split(':')
  167. start_hour, start_min = start.split(':')
  168. if end_hour < start_hour:
  169. end_day = week_list[week] + 1
  170. else:
  171. end_day = week_list[week]
  172. periods += [{
  173. "open":{
  174. "day": week_list[week],
  175. "time": start.replace(':','')
  176. },
  177. "close":{
  178. "day": end_day,
  179. "time": end.replace(':','')
  180. }
  181. }]
  182. weekday_text += ["{}: {}".format(week, ', '.join(time_list))]
  183. output['periods'] = str(periods)
  184. output['weekday_text'] = str(weekday_text)
  185. return output
  186. def get_reviews(driver, output):
  187. wait = WebDriverWait(driver, 30)
  188. more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
  189. wait.until(
  190. EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
  191. )
  192. element = driver.find_element_by_css_selector(more_reviews_css)
  193. driver.implicitly_wait(10)
  194. ActionChains(driver).move_to_element(element).click(element).perform()
  195. time.sleep(0.5)
  196. # page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]', 5)
  197. page_down_(driver, '//div[@class="PPCwl"]',5)
  198. comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
  199. if comment_soup.find_all('div',class_='ODSEW-ShBeI-xJzy8c-bF1uUb') != 0:
  200. all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
  201. for ap in all_photo:
  202. ap.click()
  203. if comment_soup.select('button[aria-label="顯示更多"]') != 0:
  204. all_review = driver.find_elements_by_css_selector('button[aria-label="顯示更多"]')
  205. for ap in all_review:
  206. ap.click()
  207. comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
  208. count = 0
  209. reviews = []
  210. for comment in comment_soup.find_all('div',{'class':'ODSEW-ShBeI'}):
  211. comment_a_tag = comment.find_all('a')
  212. author_name = blank_check(comment_a_tag[1].find('div', class_= 'ODSEW-ShBeI-title').text)
  213. profile_photo_url = comment_a_tag[0].find('img')['src']
  214. rating = blank_check(comment.find('span',{'role':'img'})['aria-label'].replace('顆星', ''))
  215. text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
  216. created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text
  217. photos = []
  218. c = 0
  219. for i in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c'):
  220. path = i['style'].split(';')[0].split('url')[1].replace('\"','').replace('(','').replace(')','')
  221. photos += [path]
  222. c += 1
  223. reviews += [{
  224. 'id': comment.find('a')['href'].split('/')[5],
  225. 'author_name': author_name,
  226. 'profile_photo_url': profile_photo_url,
  227. 'rating': int(rating),
  228. 'text': text,
  229. 'created_at': created_at,
  230. 'photos': photos
  231. }]
  232. count += 1
  233. output['reviews'] = str(reviews)
  234. driver.back()
  235. return output
# def get_photo(output, shop_soup):
#     shop_photo = {}
#     for i in shop_soup.find('div',{'aria-label':'{}的相片'.format(output['name'])}).find_all('button'):
#         try:
#             if i['aria-label'] == '街景服務和 360 度相片' or i['aria-label'] == '影片':
#                 continue
#             shop_photo[i['aria-label']] = i.find('img')['src']
#         except:
#             pass
#     output['shop_photo'] = shop_photo
#     return output
  247. def find_photo_list(driver):
  248. time.sleep(0.5)
  249. wait = WebDriverWait(driver, 60)
  250. wait.until(
  251. EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
  252. )
  253. page_down_(driver,'//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a' , 10)
  254. photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
  255. photo_url = []
  256. count = 0
  257. for i in photo_soup.find_all('a', class_='mWq4Rd-eEDwDf'):
  258. if count > 5: break
  259. a_url = i.find('div', class_='mWq4Rd-HiaYvf-CNusmb-gevUs loaded')
  260. if a_url:
  261. if a_url.find('width') != -1:
  262. sentence = a_url['style']
  263. photo = re.search(r'https:(.*)\"', sentence)
  264. photo_url += [photo.group(0).replace('\"','')]
  265. count += 1
  266. return photo_url
  267. def find_big_photo(output, driver):
  268. # element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
  269. wait = WebDriverWait(driver, 60)
  270. wait.until(
  271. EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button'))
  272. )
  273. element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button')
  274. ActionChains(driver).move_to_element(element).click(element).perform()
  275. output['shop_photo'] = '[]'
  276. output['menu_photo'] = '[]'
  277. photo_map = {
  278. '全部': 'shop_photo',
  279. '菜單': 'menu_photo'
  280. }
  281. driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='1']")
  282. photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
  283. tab_dict = {}
  284. for tab_index in [0, 1, 2]:
  285. selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
  286. if len(selector) != 0:
  287. photo_name = selector[0].text
  288. if photo_name == '菜單':
  289. tab_dict[photo_name] = tab_index
  290. elif photo_name == '全部':
  291. tab_dict[photo_name] = tab_index
  292. print(tab_dict)
  293. for tab_ in tab_dict:
  294. tab_index = tab_dict[tab_]
  295. print(tab_index)
  296. wait = WebDriverWait(driver, 60)
  297. wait.until(
  298. EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
  299. )
  300. element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
  301. ActionChains(driver).move_to_element(element).click(element).perform()
  302. photo_list = find_photo_list(driver)
  303. output[photo_map[tab_]] = str(photo_list)
  304. return output
  305. def get_url_list(driver):
  306. # wait = WebDriverWait(driver, 10)
  307. # wait.until(
  308. # EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
  309. # )
  310. # driver.back()
  311. time.sleep(2)
  312. for i in range(5, 43, 2):
  313. driver.find_element(By.XPATH,'//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
  314. url_soup = BeautifulSoup(driver.page_source, 'html.parser')
  315. url_list = []
  316. for i in url_soup.find_all('a'):
  317. try:
  318. if i['href'].find('maps/place') != -1:
  319. url_list += [[i['href'], i['aria-label']]]
  320. except:
  321. pass
  322. return url_list
  323. def data_select_insert(db, table_name, table_col, data):
  324. tmp = []
  325. for name_ in table_col:
  326. if name_ == 'crawler_date':
  327. continue
  328. if name_ == 'lon' or name_ == 'lat':
  329. tmp += [float(data[name_])]
  330. else:
  331. tmp += [data[name_]]
  332. tmp += [datetime.today().strftime("%Y/%m/%d %H:%M")]
  333. insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
  334. .format(table_name, str(tuple(table_col)).replace('\'',''), tuple(tmp))
  335. DA.mysql_insert_data(db, insert_sql)
  336. def time_click(driver):
  337. shop_soup_tmp = BeautifulSoup(driver.page_source, 'html.parser')
  338. status = ''
  339. try:
  340. if len(shop_soup_tmp.select("span[aria-label='顯示本週營業時間']")) != 0:
  341. time_css = "span[aria-label='顯示本週營業時間']"
  342. element = driver.find_element_by_css_selector(time_css)
  343. driver.implicitly_wait(10)
  344. ActionChains(driver).move_to_element(element).click(element).perform()
  345. status = '正常'
  346. elif len(shop_soup_tmp.select("img[aria-label='通知']")) != 0:
  347. status = shop_soup_tmp.find('span',class_='LJKBpe-Tswv1b-text aSftqf').text
  348. # status = '永久停業' or '暫時關閉'
  349. elif len(shop_soup_tmp.select('button[aria-label*="查看更詳細的營業時間"]')) != 0:
  350. status = 'error'
  351. return status
  352. except:
  353. return ''
  354. def get_new_keyword(db):
  355. result = db.query('select distinct(keyword) from shop_item_list order by keyword')
  356. result = pd.DataFrame([i for i in result])
  357. progress = db.query('select distinct(kw) from progress_list2')
  358. progress = pd.DataFrame([i for i in progress])
  359. if len(progress) != 0:
  360. keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0].values[0]
  361. else:
  362. keyword = result.iloc[0].values[0]
  363. return keyword
  364. def get_not_cralwer_url(keyword):
  365. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
  366. table = db['shop_item_list3']
  367. url_list = list(table.find(keyword=keyword))
  368. shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list2 where keyword="{}"'.format(keyword))]
  369. error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list2 where keyword="{}"'.format(keyword))]
  370. url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
  371. # url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
  372. # url_pd = url_pd[(url_pd['item_url_length']!=1000) & (url_pd['item_url_length']!=600)]
  373. url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
  374. url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
  375. print('have {} URL list'.format(len(url_pd)))
  376. # url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
  377. return url_pd
  378. def serive_create_linux(profilepath):
  379. option = webdriver.ChromeOptions()
  380. option.add_argument('--headless')
  381. option.add_argument('--no-sandbox')
  382. option.add_argument('--disable-web-security')
  383. option.add_argument('--allow-running-insecure-content')
  384. option.add_argument('--incognito')
  385. option.add_argument(
  386. 'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
  387. # option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
  388. option.add_argument(
  389. "--user-data-dir=/home/noodlesloves/.config/google-chrome/")
  390. option.add_argument("profile-directory="+profilepath)
  391. driver = webdriver.Chrome('utility/chromedriver', options=option)
  392. # driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', chrome_options=option,
  393. # service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
  394. executor_url = driver.command_executor._url
  395. session_id = driver.session_id
  396. print(session_id)
  397. print(executor_url)
  398. return driver
  399. def find_lon_lat(driver):
  400. e = driver.find_element_by_css_selector("#scene > div.widget-scene > canvas")
  401. size = e.size
  402. total_height = size['height']
  403. total_width = size['width']
  404. size2 = driver.find_element_by_css_selector("#pane > div.Yr7JMd-pane").size
  405. left_width = size2['width']
  406. print(total_height, total_width, left_width)
  407. x = (total_width - left_width) / 2 + left_width
  408. y = total_height / 2
  409. e = driver.find_element_by_css_selector("#pane > div.Yr7JMd-pane")
  410. action = webdriver.common.action_chains.ActionChains(driver)
  411. action.move_to_element_with_offset(e, x, y)
  412. action.context_click()
  413. action.perform()
  414. time.sleep(0.5)
  415. element = driver.find_element_by_css_selector('#action-menu > ul > li:nth-child(1)')
  416. lat, lon = element.text.split(',')
  417. return float(lat), float(lon)
  418. def get_unique_id(driver):
  419. element = driver.find_element(By.CSS_SELECTOR, "button[data-value='分享']")
  420. driver.implicitly_wait(5)
  421. ActionChains(driver).move_to_element(element).click(element).perform()
  422. time.sleep(0.5)
  423. for i in range(5):
  424. ele = driver.find_element(By.CSS_SELECTOR, "input")
  425. short_url = ele.get_attribute('value')
  426. unique_id = short_url.split('/')[-1]
  427. if len(unique_id) != 0:
  428. break
  429. time.sleep(0.5)
  430. element = driver.find_element(By.CSS_SELECTOR, "button[aria-label='關閉']")
  431. driver.implicitly_wait(5)
  432. ActionChains(driver).move_to_element(element).click(element).perform()
  433. return unique_id
  434. def page_down_(driver, xpath_css, time_):
  435. elmts = driver.find_elements_by_xpath(xpath_css)
  436. print(elmts)
  437. if len(elmts)>1:
  438. elmt=elmts[1]
  439. else:
  440. elmt=elmts[0]
  441. actions = ActionChains(driver)
  442. actions.move_to_element(elmt).click().perform()
  443. for i in range(time_):
  444. try:
  445. actions = ActionChains(driver)
  446. actions.send_keys(Keys.PAGE_DOWN).perform()
  447. except:
  448. traceback.print_exc()
  449. time.sleep(0.5)
  450. def main():
  451. db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
  452. db2 = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
  453. table2 = db2['swire_store_list']
  454. # keyword = '麻辣火鍋'
  455. # if len(sys.argv) >1:
  456. # keyword=sys.argv[1]
  457. # port=4444
  458. # if len(sys.argv) >2:
  459. # port=int(sys.argv[2])
  460. if len(sys.argv) > 1 :
  461. port=int(sys.argv[1])
  462. print('restart docker p{}'.format(port))
  463. os.system('sudo docker container restart p'+str(port))
  464. time.sleep(8)
  465. for i in range(10):
  466. result = db2.query('select * from swire_store_list where check_ is null ORDER BY RAND() limit 500')
  467. url_pd = pd.DataFrame([dict(i) for i in result])
  468. url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
  469. # keyword = get_new_keyword(db2)
  470. # table2.insert({'kw':keyword,'num':0})
  471. # url_pd = get_not_cralwer_url(keyword)
  472. # print('drvier start {}...'.format(keyword))
  473. driver = brower_start(port)
  474. #driver = serive_create('Profile 6')
  475. #profilepath = 'Profile 1'
  476. #driver = serive_create_linux(profilepath)
  477. for key, row in url_pd.iterrows():
  478. try:
  479. name = row['name']
  480. item_url = row['item_url']
  481. print(key, name, ': ' ,item_url)
  482. print('start...')
  483. driver.get(item_url)
  484. page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
  485. # lat, lon = find_lon_lat(driver)
  486. # unique_id = get_unique_id(driver)
  487. time_status = time_click(driver)
  488. time.sleep(0.5)
  489. shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
  490. output = {
  491. # 'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text),
  492. 'name': name,
  493. 'fid': row['fid']
  494. }
  495. print(output['name'])
  496. print('get_shop_info')
  497. output = get_shop_info(driver, output, shop_soup)
  498. print('get_intro_info')
  499. if len(shop_soup.select("div[aria-label='{}簡介']".format(output['name']))) != 0:
  500. output = get_intro_info(driver, output)
  501. else:
  502. for key in intro_list:
  503. output[intro_list[key][0]] = '[]'
  504. print('get_time_list')
  505. if time_status == '正常':
  506. output = get_time_list(shop_soup, output)
  507. else:
  508. output['open_now'] = False
  509. output['periods'] = ''
  510. output['weekday_text'] = ''
  511. print('user_ratings_total')
  512. if output['user_ratings_total'] == '':
  513. output['reviews'] = ''
  514. else:
  515. output = get_reviews(driver, output)
  516. print('find_big_photo')
  517. output = find_big_photo(output, driver)
  518. output_name = output['name'].replace('(','').replace(')', '')
  519. query_name = '{}+{}'.format(output_name, output['addr'])
  520. query_name = query_name.replace(' ','')
  521. output['item_url'] = item_url
  522. output['keyword'] = row['keyword']
  523. output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
  524. data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
  525. table2.upsert({'place_id':row['place_id'],'check_':1},['place_id'])
  526. except Exception as e:
  527. table3 = db2['error_list2']
  528. table3.insert({'num':row['name'],'keyword':row['keyword'],'item_url':row['item_url'],'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
  529. print(e)
  530. # error_table_col = ['name', 'keyword', 'item_url', 'crawler_date']
  531. # db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
  532. # data_select_insert(db, 'error_list2', error_table_col, row)
  533. time.sleep(1)
  534. if __name__ == '__main__':
  535. main()