run.py

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *

from datetime import datetime
import traceback
import dataset
import pandas as pd
import time
import json
import re
import sys
import os
# import pyautogui as pag
def serive_create(profilepath):
    """Start a local Chrome driver bound to an existing user profile (Windows paths)."""
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument("--user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("profile-directory=" + profilepath)

    driver = webdriver.Chrome('./utility/chromedriver_win32/chromedriver', options=option)
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver
def brower_start(port):
    """Start a local Chrome browser; the remote-grid variant is kept for reference."""
    options = webdriver.ChromeOptions()
    browser = webdriver.Chrome(options=options)
    # browser = webdriver.Remote(
    #     command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
    #     # command_executor='http://192.53.174.202:' + str(port) + '/wd/hub',
    #     desired_capabilities=options.to_capabilities()
    # )
    return browser
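
# Hypothetical sketch of the remote variant commented out above: pointing the
# driver at a Selenium grid listening on `port` instead of starting a local
# Chrome (Selenium 4 style, where `options=` replaces `desired_capabilities=`):
#
#   browser = webdriver.Remote(
#       command_executor='http://127.0.0.1:{}/wd/hub'.format(port),
#       options=options)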
def keyin_keyword(driver, keyword):
    """Type the keyword into the Maps search box and press Enter."""
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
    # element = driver.find_element_by_class_name("V0h1Ob-haAclf")
    # driver.implicitly_wait(30)
    # ActionChains(driver).move_to_element(element).click(element).perform()
def open_time(driver):
    """Click the hours panel; return 1 if clicked, 0 if the panel mentions 預訂 (booking)."""
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
    if element.text.find('預訂') == -1:
        element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
        driver.implicitly_wait(10)
        ActionChains(driver).move_to_element(element).click(element).perform()
        return 1
    else:
        return 0
def get_shop_info(driver, output, shop_soup):
    """Parse city/area, address, phone, and every field declared in element_list."""
    # current_url_split = driver.current_url.split('@')[1].split(',')
    # output['lon'] = current_url_split[1]
    # output['lat'] = current_url_split[0]

    location = shop_soup.find('button', {'data-item-id': 'oloc'})['aria-label'].split(' ')
    output['city'] = location[-1]
    output['area'] = location[-2]

    try:
        # '地址' = "address"
        output['addr'] = shop_soup.find('button', {'data-item-id': 'address'})['aria-label'].replace('地址:', '')
    except:
        output['addr'] = ''

    try:
        # '複製電話號碼' = "copy phone number" tooltip
        output['tel'] = blank_check(shop_soup.find('button', {'data-tooltip': '複製電話號碼'})['aria-label'].split(':')[1])
    except:
        output['tel'] = ''
    print(output['addr'], ',', output['tel'])

    for key in element_list:
        try:
            element = element_list[key]
            if len(element) == 3:
                value = shop_soup.find(element[0], element[1])[element[2]]
            else:
                tmp_value = shop_soup.find(element[0], element[1])
                if tmp_value:
                    value = tmp_value.text
                else:
                    value = ''
            output[key] = value_check(key, value)
        except:
            output[key] = ''

    return output
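
# Note: element_list comes from utility.parseutils (not shown in this file).
# From the loop above, each value is assumed to be either (tag, attrs,
# attribute_name) to read an attribute, or (tag, attrs) to read the tag's
# text. A hypothetical entry for illustration only:
#
#   element_list = {
#       'user_ratings_total': ('button', {'jsaction': 'pane.rating.moreReviews'}, 'aria-label'),
#   }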
def get_intro_info(driver, output):
    """Open the 簡介 (About) tab and collect the checked items for each intro_list key."""
    # element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
    try:
        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}簡介']".format(output['name']))
        driver.implicitly_wait(5)
        ActionChains(driver).move_to_element(element).click(element).perform()

        # pageSource = driver.page_source
        # fileToWrite = open("page_source.html", "w")
        # fileToWrite.write(pageSource)
        # fileToWrite.close()

        page_down_(driver, '//*[@id="pane"]/div/div[1]', 3)

        intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
        for key in intro_list:
            elements = intro_soup.find('div', {'aria-label': key})
            if elements:
                element = elements.find_all('li', {'class': 'LQjNnc-p83tee-JNdkSc-ibnC6b'})
                count = 0
                tmp = []
                for ele in element:
                    # keep only items marked with the check icon
                    # if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
                    if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"}):
                        tmp += [{
                            'id': count,
                            intro_list[key][1]: blank_check(ele.text)
                        }]
                        count += 1
                print(str(tmp))
                output[intro_list[key][0]] = str(tmp)
            else:
                output[intro_list[key][0]] = '[]'
        driver.back()
        return output
    except:
        for key in intro_list:
            output[intro_list[key][0]] = '[]'
        return output
def get_time_list(shop_soup, output):
    """Parse the opening-hours table into Google-Places-style periods and weekday_text."""
    periods = []
    weekday_text = []

    open_now = blank_check(shop_soup.find('span', {'class': 'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
    if open_now == '永久停業' or open_now == '暫時關閉':  # permanently / temporarily closed
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'

    for tr_ in shop_soup.find_all('tr'):
        if tr_.find('div').text.replace(' ', '') != '':
            week = tr_.find('div').text
            time_list = [blank_check(i.text) for i in tr_.find_all('li')]

            for time_ in time_list:
                if time_ == '24 小時營業':  # open 24 hours
                    periods += [{
                        "open": {"day": week_list[week], "time": '0000'},  # string, consistent with the branch below
                        "close": {"day": week_list[week], "time": ''}
                    }]
                elif time_ == '休息':  # closed that day
                    periods += [{
                        "open": {"day": week_list[week], "time": ''},
                        "close": {"day": week_list[week], "time": ''}
                    }]
                else:
                    start, end = time_.split('–')
                    end_hour, end_min = end.split(':')
                    start_hour, start_min = start.split(':')
                    # a closing hour smaller than the opening hour means the
                    # shop closes after midnight, i.e. on the next day
                    if int(end_hour) < int(start_hour):
                        end_day = week_list[week] + 1
                    else:
                        end_day = week_list[week]

                    periods += [{
                        "open": {"day": week_list[week], "time": start.replace(':', '')},
                        "close": {"day": end_day, "time": end.replace(':', '')}
                    }]
            weekday_text += ["{}: {}".format(week, ', '.join(time_list))]

    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    return output
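
# Illustrative example of the structure built above, assuming week_list (from
# utility.parseutils) maps 星期一 (Monday) to 1: a table row such as
#   '星期一: 11:00–21:00'
# would contribute
#   periods      -> [{'open': {'day': 1, 'time': '1100'},
#                     'close': {'day': 1, 'time': '2100'}}]
#   weekday_text -> ['星期一: 11:00–21:00']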
def get_reviews(driver, output):
    """Open the reviews pane, expand photos and long texts, and parse each review."""
    wait = WebDriverWait(driver, 30)
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
    )
    element = driver.find_element_by_css_selector(more_reviews_css)
    driver.implicitly_wait(10)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(0.5)

    # page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]', 5)
    page_down_(driver, '//div[@class="PPCwl"]', 5)

    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    if len(comment_soup.find_all('div', class_='ODSEW-ShBeI-xJzy8c-bF1uUb')) != 0:
        all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
        for ap in all_photo:
            ap.click()

    if len(comment_soup.select('button[aria-label="顯示更多"]')) != 0:  # "show more"
        all_review = driver.find_elements_by_css_selector('button[aria-label="顯示更多"]')
        for ap in all_review:
            ap.click()

    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    count = 0
    reviews = []
    for comment in comment_soup.find_all('div', {'class': 'ODSEW-ShBeI'}):
        comment_a_tag = comment.find_all('a')
        author_name = blank_check(comment_a_tag[1].find('div', class_='ODSEW-ShBeI-title').text)
        profile_photo_url = comment_a_tag[0].find('img')['src']
        rating = blank_check(comment.find('span', {'role': 'img'})['aria-label'].replace('顆星', ''))  # strip "stars"
        text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
        created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text

        photos = []
        c = 0
        for i in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c'):
            path = i['style'].split(';')[0].split('url')[1].replace('\"', '').replace('(', '').replace(')', '')
            photos += [path]
            c += 1

        reviews += [{
            'id': comment.find('a')['href'].split('/')[5],
            'author_name': author_name,
            'profile_photo_url': profile_photo_url,
            'rating': int(rating),
            'text': text,
            'created_at': created_at,
            'photos': photos
        }]
        count += 1

    output['reviews'] = str(reviews)
    driver.back()
    return output
# def get_photo(output, shop_soup):
#     shop_photo = {}
#     for i in shop_soup.find('div', {'aria-label': '{}的相片'.format(output['name'])}).find_all('button'):
#         try:
#             if i['aria-label'] == '街景服務和 360 度相片' or i['aria-label'] == '影片':
#                 continue
#             shop_photo[i['aria-label']] = i.find('img')['src']
#         except:
#             pass
#     output['shop_photo'] = shop_photo
#     return output
def find_photo_list(driver):
    """Scroll the photo grid and return up to six photo URLs."""
    time.sleep(0.5)
    wait = WebDriverWait(driver, 60)
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
    )
    page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a', 10)

    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
    photo_url = []
    count = 0
    for i in photo_soup.find_all('a', class_='mWq4Rd-eEDwDf'):
        if count > 5:
            break
        a_url = i.find('div', class_='mWq4Rd-HiaYvf-CNusmb-gevUs loaded')
        if a_url:
            # look for the image URL in the inline style (the original compared
            # a Tag search result to -1, which was always true)
            if a_url['style'].find('width') != -1:
                sentence = a_url['style']
                photo = re.search(r'https:(.*)\"', sentence)
                photo_url += [photo.group(0).replace('\"', '')]
                count += 1
    return photo_url
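
# Illustrative example of the regex extraction above, assuming a loaded
# thumbnail carries its image in an inline style attribute (URL hypothetical):
#   style = 'background-image:url("https://lh5.googleusercontent.com/p/XYZ=w203-h114-k-no");width:203px'
#   re.search(r'https:(.*)\"', style).group(0)
#     -> 'https://lh5.googleusercontent.com/p/XYZ=w203-h114-k-no"'
# after which the trailing quote is stripped by replace('\"', '').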
def find_big_photo(output, driver):
    """Open the photo viewer and collect the 全部 (all) and 菜單 (menu) photo lists."""
    # element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
    wait = WebDriverWait(driver, 60)
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button'))
    )
    element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button')
    ActionChains(driver).move_to_element(element).click(element).perform()

    output['shop_photo'] = '[]'
    output['menu_photo'] = '[]'
    photo_map = {
        '全部': 'shop_photo',  # "all"
        '菜單': 'menu_photo'   # "menu"
    }

    # probe for the tab bar; raises NoSuchElementException if it never rendered
    driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='1']")
    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
    tab_dict = {}
    for tab_index in [0, 1, 2]:
        selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
        if len(selector) != 0:
            photo_name = selector[0].text
            if photo_name in photo_map:
                tab_dict[photo_name] = tab_index
    print(tab_dict)

    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        photo_list = find_photo_list(driver)
        output[photo_map[tab_]] = str(photo_list)
    return output
def get_url_list(driver):
    """Scroll through the result list and collect [url, label] pairs for place links."""
    # wait = WebDriverWait(driver, 10)
    # wait.until(
    #     EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
    # )
    # driver.back()
    time.sleep(2)
    for i in range(5, 43, 2):
        driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except:
            pass
    return url_list
def data_select_insert(db, table_name, table_col, data):
    """Build an INSERT IGNORE statement from table_col/data and execute it."""
    tmp = []
    for name_ in table_col:
        if name_ == 'crawler_date':
            continue
        if name_ == 'lon' or name_ == 'lat':
            tmp += [float(data[name_])]
        else:
            tmp += [data[name_]]
    tmp += [datetime.today().strftime("%Y/%m/%d %H:%M")]

    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
        .format(table_name, str(tuple(table_col)).replace('\'', ''), tuple(tmp))
    DA.mysql_insert_data(db, insert_sql)
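
# Illustrative example of the statement the formatting above produces (table
# name and values are hypothetical):
#   table_col = ['name', 'lon', 'lat', 'crawler_date']
# yields
#   INSERT IGNORE INTO shop_list2(name, lon, lat, crawler_date)
#       VALUES ('foo', 120.67, 24.17, '2021/12/10 12:00')
# Relying on Python's tuple repr for quoting works for simple values, but a
# parameterized query would be safer for strings containing quotes.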
def time_click(driver):
    """Click the opening-hours toggle; return '正常' (normal), a closure notice, 'error', or ''."""
    shop_soup_tmp = BeautifulSoup(driver.page_source, 'html.parser')
    status = ''
    try:
        if len(shop_soup_tmp.select("span[aria-label='顯示本週營業時間']")) != 0:  # "show this week's hours"
            time_css = "span[aria-label='顯示本週營業時間']"
            element = driver.find_element_by_css_selector(time_css)
            driver.implicitly_wait(10)
            ActionChains(driver).move_to_element(element).click(element).perform()
            status = '正常'  # normal
        elif len(shop_soup_tmp.select("img[aria-label='通知']")) != 0:  # notice icon
            status = shop_soup_tmp.find('span', class_='LJKBpe-Tswv1b-text aSftqf').text
            # status is e.g. '永久停業' (permanently closed) or '暫時關閉' (temporarily closed)
        elif len(shop_soup_tmp.select('button[aria-label*="查看更詳細的營業時間"]')) != 0:  # "see detailed hours"
            status = 'error'
        return status
    except:
        return ''
def get_new_keyword(db):
    """Pick the first keyword that has not yet been logged in progress_list2."""
    result = db.query('select distinct(keyword) from shop_item_list order by keyword')
    result = pd.DataFrame([i for i in result])
    progress = db.query('select distinct(kw) from progress_list2')
    progress = pd.DataFrame([i for i in progress])

    if len(progress) != 0:
        keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0].values[0]
    else:
        keyword = result.iloc[0].values[0]
    return keyword
def get_not_cralwer_url(keyword):
    """Return item URLs for the keyword that appear in neither shop_list2 nor error_list2."""
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list3']
    url_list = list(table.find(keyword=keyword))
    shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list2 where keyword="{}"'.format(keyword))]
    error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list2 where keyword="{}"'.format(keyword))]

    url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
    # url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
    # url_pd = url_pd[(url_pd['item_url_length'] != 1000) & (url_pd['item_url_length'] != 600)]
    url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
    url_pd = url_pd[~url_pd['item_url'].isin(error_item)]

    print('have {} URL list'.format(len(url_pd)))
    # url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
    return url_pd
def serive_create_linux(profilepath):
    """Start a headless Chrome driver bound to an existing user profile (Linux paths)."""
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    option.add_argument('--no-sandbox')
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument('--incognito')
    option.add_argument(
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
    # option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("--user-data-dir=/home/noodlesloves/.config/google-chrome/")
    option.add_argument("profile-directory=" + profilepath)

    driver = webdriver.Chrome('utility/chromedriver', options=option)
    # driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', chrome_options=option,
    #                           service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    return driver
def find_lon_lat(driver):
    """Right-click the centre of the visible map and read lat/lon from the context menu."""
    e = driver.find_element_by_css_selector("#scene > div.widget-scene > canvas")
    size = e.size
    total_height = size['height']
    total_width = size['width']

    size2 = driver.find_element_by_css_selector("#pane > div.Yr7JMd-pane").size
    left_width = size2['width']
    print(total_height, total_width, left_width)

    # centre of the map area to the right of the left-hand pane
    x = (total_width - left_width) / 2 + left_width
    y = total_height / 2

    e = driver.find_element_by_css_selector("#pane > div.Yr7JMd-pane")
    action = webdriver.common.action_chains.ActionChains(driver)
    action.move_to_element_with_offset(e, x, y)
    action.context_click()
    action.perform()

    time.sleep(0.5)
    element = driver.find_element_by_css_selector('#action-menu > ul > li:nth-child(1)')
    lat, lon = element.text.split(',')
    return float(lat), float(lon)
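
# Illustrative example: the first entry of the right-click context menu shows
# the coordinates as plain text, e.g. (values hypothetical)
#   element.text == '24.1760271, 120.6705323'
# which the split above returns as (24.1760271, 120.6705323).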
def get_unique_id(driver):
    """Open the share dialog and read the place's unique id from the short URL."""
    element = driver.find_element(By.CSS_SELECTOR, "button[data-value='分享']")  # "share"
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(0.5)

    for i in range(5):
        ele = driver.find_element(By.CSS_SELECTOR, "input")
        short_url = ele.get_attribute('value')
        unique_id = short_url.split('/')[-1]
        if len(unique_id) != 0:
            break
        time.sleep(0.5)

    element = driver.find_element(By.CSS_SELECTOR, "button[aria-label='關閉']")  # "close"
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(element).click(element).perform()
    return unique_id
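
# Illustrative example: the share dialog's input holds a short link, assumed
# to look like (id hypothetical; the exact domain is whatever Google returns)
#   https://goo.gl/maps/AbCdEfGh12345
# so short_url.split('/')[-1] yields 'AbCdEfGh12345'.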
def page_down_(driver, xpath_css, time_):
    """Click the element at xpath_css, then send PAGE_DOWN time_ times to scroll it."""
    elmts = driver.find_elements_by_xpath(xpath_css)
    print(elmts)
    if len(elmts) > 1:
        elmt = elmts[1]
    else:
        elmt = elmts[0]

    actions = ActionChains(driver)
    actions.move_to_element(elmt).click().perform()
    for i in range(time_):
        try:
            actions = ActionChains(driver)
            actions.send_keys(Keys.PAGE_DOWN).perform()
        except:
            traceback.print_exc()
        time.sleep(0.5)
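
# Example usage, with a selector used elsewhere in this file: scroll the
# reviews pane five pages down.
#   page_down_(driver, '//div[@class="PPCwl"]', 5)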
def main():
    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    db2 = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table2 = db2['swire_store_list']

    # keyword = '麻辣火鍋'  # "spicy hot pot"
    # if len(sys.argv) > 1:
    #     keyword = sys.argv[1]
    # port = 4444
    # if len(sys.argv) > 2:
    #     port = int(sys.argv[2])
    port = 4444  # default so brower_start(port) below is defined even without argv
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        print('restart docker p{}'.format(port))
        os.system('sudo docker container restart p' + str(port))
        time.sleep(8)

    for i in range(10):
        result = db2.query('select * from swire_store_list where check_ is null ORDER BY RAND() limit 500')
        url_pd = pd.DataFrame([dict(i) for i in result])
        url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))

        # keyword = get_new_keyword(db2)
        # table2.insert({'kw': keyword, 'num': 0})
        # url_pd = get_not_cralwer_url(keyword)
        # print('drvier start {}...'.format(keyword))

        driver = brower_start(port)
        # driver = serive_create('Profile 6')
        # profilepath = 'Profile 1'
        # driver = serive_create_linux(profilepath)

        for key, row in url_pd.iterrows():
            try:
                name = row['name']
                item_url = row['item_url']
                print(key, name, ':', item_url)

                print('start...')
                driver.get(item_url)
                page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
                # lat, lon = find_lon_lat(driver)
                # unique_id = get_unique_id(driver)
                time_status = time_click(driver)
                time.sleep(0.5)
                shop_soup = BeautifulSoup(driver.page_source, 'html.parser')

                output = {
                    # 'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text),
                    'name': name,
                    'fid': row['fid']
                }
                print(output['name'])

                print('get_shop_info')
                output = get_shop_info(driver, output, shop_soup)

                print('get_intro_info')
                if len(shop_soup.select("div[aria-label='{}簡介']".format(output['name']))) != 0:
                    output = get_intro_info(driver, output)
                else:
                    for intro_key in intro_list:  # avoid shadowing the outer loop's `key`
                        output[intro_list[intro_key][0]] = '[]'

                print('get_time_list')
                if time_status == '正常':
                    output = get_time_list(shop_soup, output)
                else:
                    output['open_now'] = 'False'  # string, consistent with get_time_list
                    output['periods'] = ''
                    output['weekday_text'] = ''

                print('user_ratings_total')
                if output['user_ratings_total'] == '':
                    output['reviews'] = ''
                else:
                    output = get_reviews(driver, output)

                print('find_big_photo')
                output = find_big_photo(output, driver)

                output_name = output['name'].replace('(', '').replace(')', '')
                query_name = '{}+{}'.format(output_name, output['addr'])
                query_name = query_name.replace(' ', '')
                output['item_url'] = item_url
                output['keyword'] = row['keyword']
                output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)

                data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
                table2.upsert({'place_id': row['place_id'], 'check_': 1}, ['place_id'])
            except Exception as e:
                table3 = db2['error_list2']
                table3.insert({'num': row['name'], 'keyword': row['keyword'], 'item_url': row['item_url'], 'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
                print(e)
                # error_table_col = ['name', 'keyword', 'item_url', 'crawler_date']
                # db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
                # data_select_insert(db, 'error_list2', error_table_col, row)
            time.sleep(1)


if __name__ == '__main__':
    main()