# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
import traceback
import dataset
import pandas as pd
import time
import json
import re
import sys
import os
# import pyautogui as pag
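
# Overview (assumed from the code below): run3.py crawls Google Maps place
# pages with Selenium, parses them with BeautifulSoup and writes the results
# to MySQL. main() pulls unchecked rows from swire_store_list, opens each
# item URL in headless Chrome, scrapes shop info / intro / opening hours /
# reviews / photos, and inserts the record via data_select_insert().
# MYSQL_CONFIG, DB_NAME, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, element_list,
# intro_list, week_list, blank_check and value_check are assumed to be
# provided by the starred utility imports (utility.parseutils / utility.connect).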


def service_create(profilepath):
    # Launch a local (Windows) Chrome bound to an existing user profile.
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument("--user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('./utility/chromedriver_win32/chromedriver', options=option)
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver


def browser_start(port):
    # Start a headless local Chrome. The commented block below is the
    # original Selenium-Grid (Docker) variant keyed by `port`.
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument("--no-sandbox")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    browser = webdriver.Chrome(options=options)
    browser.set_window_size(1400, 1000)
    # browser = webdriver.Remote(
    #     command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
    #     # command_executor='http://192.53.174.202:' + str(port) + '/wd/hub',
    #     desired_capabilities=options.to_capabilities()
    # )
    return browser


def keyin_keyword(driver, keyword):
    # Type the search keyword into the Maps search box and press Enter.
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
    # element = driver.find_element_by_class_name("V0h1Ob-haAclf")
    # driver.implicitly_wait(30)
    # ActionChains(driver).move_to_element(element).click(element).perform()


def open_time(driver):
    # Expand the opening-hours row unless it is a booking ("預訂") widget.
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
    if element.text.find('預訂') == -1:
        element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
        driver.implicitly_wait(10)
        ActionChains(driver).move_to_element(element).click(element).perform()
        return 1
    else:
        return 0


def get_shop_info(driver, output, shop_soup):
    # Parse city/area, address and phone, then fill every element_list-driven
    # field from the rendered page.
    # current_url_split = driver.current_url.split('@')[1].split(',')
    # output['lon'] = current_url_split[1]
    # output['lat'] = current_url_split[0]
    location = shop_soup.find('button', {'data-item-id': 'oloc'})['aria-label'].split(' ')
    output['city'] = location[-1]
    output['area'] = location[-2]
    try:
        # '地址:' is the "Address:" prefix on the aria-label.
        output['addr'] = shop_soup.find('button', {'data-item-id': 'address'})['aria-label'].replace('地址:', '')
    except Exception:
        output['addr'] = ''
    try:
        # '複製電話號碼' is the "copy phone number" tooltip.
        output['tel'] = blank_check(shop_soup.find('button', {'data-tooltip': '複製電話號碼'})['aria-label'].split(':')[1])
    except Exception:
        output['tel'] = ''
    print(output['addr'], ', ', output['tel'])
    for key in element_list:
        try:
            element = element_list[key]
            if len(element) == 3:
                value = shop_soup.find(element[0], element[1])[element[2]]
            else:
                tmp_value = shop_soup.find(element[0], element[1])
                if tmp_value:
                    value = tmp_value.text
                else:
                    value = ''
            output[key] = value_check(key, value)
        except Exception:
            output[key] = ''
    return output
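
# element_list is not defined in this file; from the lookup above it is
# assumed to map an output key to BeautifulSoup find() arguments, with an
# optional third entry naming an attribute to read instead of .text.
# A purely hypothetical illustration:
# element_list = {
#     'user_ratings_total': ('button', {'jsaction': '...'}),                # -> tag.text
#     'website': ('button', {'data-item-id': 'authority'}, 'aria-label'),   # -> tag['aria-label']
# }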


def get_intro_info(driver, output):
    # Open the "About" ("簡介") panel and collect the checked attribute items
    # for every section listed in intro_list.
    # element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
    try:
        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}簡介']".format(output['name']))
        driver.implicitly_wait(5)
        ActionChains(driver).move_to_element(element).click(element).perform()
        # pageSource = driver.page_source
        # fileToWrite = open("page_source.html", "w")
        # fileToWrite.write(pageSource)
        # fileToWrite.close()
        page_down_(driver, '//*[@id="pane"]/div/div[1]', 3)
        intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
        for key in intro_list:
            elements = intro_soup.find('div', {'aria-label': key})
            if elements:
                element = elements.find_all('li', {'class': 'LQjNnc-p83tee-JNdkSc-ibnC6b'})
                count = 0
                tmp = []
                for ele in element:
                    # Only items carrying the check icon are active attributes.
                    # if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
                    if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"}):
                        tmp += [{
                            'id': count,
                            intro_list[key][1]: blank_check(ele.text)
                        }]
                        count += 1
                print(str(tmp))
                output[intro_list[key][0]] = str(tmp)
            else:
                output[intro_list[key][0]] = '[]'
        driver.back()
        return output
    except Exception:
        for key in intro_list:
            output[intro_list[key][0]] = '[]'
        return output


def get_time_list(shop_soup, output):
    # Parse the weekly hours table into Google-Places-style "periods".
    periods = []
    weekday_text = []
    open_now = blank_check(shop_soup.find('span', {'class': 'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
    if open_now == '永久停業' or open_now == '暫時關閉':  # permanently / temporarily closed
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'
    for tr_ in shop_soup.find_all('tr'):
        if tr_.find('div').text.replace(' ', '') != '':
            week = tr_.find('div').text
            time_list = [blank_check(i.text) for i in tr_.find_all('li')]
            for time_ in time_list:
                if time_ == '24 小時營業':  # open 24 hours
                    periods += [{
                        "open": {"day": week_list[week], "time": '0000'},
                        "close": {"day": week_list[week], "time": ''}
                    }]
                elif time_ == '休息':  # closed that day
                    periods += [{
                        "open": {"day": week_list[week], "time": ''},
                        "close": {"day": week_list[week], "time": ''}
                    }]
                else:
                    start, end = time_.split('–')
                    end_hour, end_min = end.split(':')
                    start_hour, start_min = start.split(':')
                    # A closing hour smaller than the opening hour means the
                    # shop closes past midnight, i.e. on the next day.
                    if int(end_hour) < int(start_hour):
                        end_day = week_list[week] + 1
                    else:
                        end_day = week_list[week]
                    periods += [{
                        "open": {"day": week_list[week], "time": start.replace(':', '')},
                        "close": {"day": end_day, "time": end.replace(':', '')}
                    }]
            weekday_text += ["{}: {}".format(week, ', '.join(time_list))]
    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    return output
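
# Sketch of one parsed row, assuming week_list maps the Chinese weekday label
# to a 0-6 day number: a '星期一' (Monday) row reading '11:00–21:30' becomes
# {'open': {'day': week_list['星期一'], 'time': '1100'},
#  'close': {'day': week_list['星期一'], 'time': '2130'}}
# mirroring the Google Places API "periods" layout.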


def get_reviews(driver, output):
    # Click "more reviews", expand photos and truncated texts, then scrape
    # author, rating, text, date and photo URLs for every review.
    wait = WebDriverWait(driver, 30)
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
    )
    element = driver.find_element_by_css_selector(more_reviews_css)
    driver.implicitly_wait(10)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(0.5)
    # page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]', 5)
    page_down_(driver, '//div[@class="PPCwl"]', 5)
    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    if len(comment_soup.find_all('div', class_='ODSEW-ShBeI-xJzy8c-bF1uUb')) != 0:
        all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
        for ap in all_photo:
            ap.click()
    if len(comment_soup.select('button[aria-label="顯示更多"]')) != 0:  # "show more"
        all_review = driver.find_elements_by_css_selector('button[aria-label="顯示更多"]')
        for ap in all_review:
            ap.click()
    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    count = 0
    reviews = []
    for comment in comment_soup.find_all('div', {'class': 'ODSEW-ShBeI'}):
        comment_a_tag = comment.find_all('a')
        author_name = blank_check(comment_a_tag[1].find('div', class_='ODSEW-ShBeI-title').text)
        profile_photo_url = comment_a_tag[0].find('img')['src']
        rating = blank_check(comment.find('span', {'role': 'img'})['aria-label'].replace('顆星', ''))  # strip "stars"
        text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
        created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text
        photos = []
        c = 0
        for i in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c'):
            # Pull the background-image URL out of the inline style.
            path = i['style'].split(';')[0].split('url')[1].replace('\"', '').replace('(', '').replace(')', '')
            photos += [path]
            c += 1
        reviews += [{
            'id': comment.find('a')['href'].split('/')[5],
            'author_name': author_name,
            'profile_photo_url': profile_photo_url,
            'rating': int(rating),
            'text': text,
            'created_at': created_at,
            'photos': photos
        }]
        count += 1
    output['reviews'] = str(reviews)
    driver.back()
    return output
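
# Each collected review is a dict shaped like (values illustrative only):
# {'id': '<token from the review href>', 'author_name': '...',
#  'profile_photo_url': 'https://...', 'rating': 5, 'text': '...',
#  'created_at': '2 週前', 'photos': ['https://...', ...]}
# The list is stringified into output['reviews'] before the DB insert.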


# def get_photo(output, shop_soup):
#     shop_photo = {}
#     for i in shop_soup.find('div', {'aria-label': '{}的相片'.format(output['name'])}).find_all('button'):
#         try:
#             if i['aria-label'] == '街景服務和 360 度相片' or i['aria-label'] == '影片':
#                 continue
#             shop_photo[i['aria-label']] = i.find('img')['src']
#         except:
#             pass
#     output['shop_photo'] = shop_photo
#     return output


def find_photo_list(driver):
    # Scroll the photo gallery and pull out up to six full-size image URLs.
    time.sleep(0.5)
    wait = WebDriverWait(driver, 60)
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
    )
    page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a', 10)
    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
    photo_url = []
    count = 0
    for i in photo_soup.find_all('a', class_='mWq4Rd-eEDwDf'):
        if count > 5:
            break
        a_url = i.find('div', class_='mWq4Rd-HiaYvf-CNusmb-gevUs loaded')
        if a_url:
            # Only thumbnails whose inline style declares a width carry an image URL.
            if a_url.get('style', '').find('width') != -1:
                sentence = a_url['style']
                photo = re.search(r'https:(.*)\"', sentence)
                photo_url += [photo.group(0).replace('\"', '')]
            count += 1
    return photo_url


def find_big_photo(output, driver):
    # Open the photo viewer, locate the "all" (全部) and "menu" (菜單) tabs,
    # and collect the photo URLs behind each of them.
    # element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
    wait = WebDriverWait(driver, 60)
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button'))
    )
    element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button')
    ActionChains(driver).move_to_element(element).click(element).perform()
    output['shop_photo'] = '[]'
    output['menu_photo'] = '[]'
    photo_map = {
        '全部': 'shop_photo',  # "all"
        '菜單': 'menu_photo'   # "menu"
    }
    driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='1']")
    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
    tab_dict = {}
    for tab_index in [0, 1, 2]:
        selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
        if len(selector) != 0:
            photo_name = selector[0].text
            if photo_name in ('菜單', '全部'):
                tab_dict[photo_name] = tab_index
    print(tab_dict)
    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        photo_list = find_photo_list(driver)
        output[photo_map[tab_]] = str(photo_list)
    return output


def get_url_list(driver):
    # wait = WebDriverWait(driver, 10)
    # wait.until(
    #     EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
    # )
    # driver.back()
    time.sleep(2)
    # Arrow-down through the result list so lazily rendered entries load.
    for i in range(5, 43, 2):
        driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except KeyError:
            pass
    return url_list


def data_select_insert(db, table_name, table_col, data):
    # Build an INSERT IGNORE statement from the column list, converting
    # lon/lat to float and stamping the crawl time as the last value.
    tmp = []
    for name_ in table_col:
        if name_ == 'crawler_date':
            continue
        if name_ == 'lon' or name_ == 'lat':
            tmp += [float(data[name_])]
        else:
            tmp += [data[name_]]
    tmp += [datetime.today().strftime("%Y/%m/%d %H:%M")]
    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}""" \
        .format(table_name, str(tuple(table_col)).replace('\'', ''), tuple(tmp))
    DA.mysql_insert_data(db, insert_sql)
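
# With table_col = ('name', 'addr', 'crawler_date') (hypothetical) this builds:
#   INSERT IGNORE INTO shop_list2(name, addr, crawler_date)
#   VALUES ('...', '...', '2021/12/10 12:00')
# i.e. crawler_date is skipped in the loop and appended last, so it is assumed
# to be the final column. Values are interpolated, not parameterised.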


def time_click(driver):
    # Click the opening-hours toggle when present and report the shop status:
    # '正常' (normal), the closure notice text, 'error', or '' on failure.
    shop_soup_tmp = BeautifulSoup(driver.page_source, 'html.parser')
    status = ''
    try:
        if len(shop_soup_tmp.select("span[aria-label='顯示本週營業時間']")) != 0:  # "show this week's hours"
            time_css = "span[aria-label='顯示本週營業時間']"
            element = driver.find_element_by_css_selector(time_css)
            driver.implicitly_wait(10)
            ActionChains(driver).move_to_element(element).click(element).perform()
            status = '正常'
        elif len(shop_soup_tmp.select("img[aria-label='通知']")) != 0:  # notice icon
            status = shop_soup_tmp.find('span', class_='LJKBpe-Tswv1b-text aSftqf').text
            # status = '永久停業' or '暫時關閉' (permanently / temporarily closed)
        elif len(shop_soup_tmp.select('button[aria-label*="查看更詳細的營業時間"]')) != 0:  # "see more detailed hours"
            status = 'error'
        return status
    except Exception:
        return ''


def get_new_keyword(db):
    # Pick the first keyword that has no entry in progress_list2 yet.
    result = db.query('select distinct(keyword) from shop_item_list order by keyword')
    result = pd.DataFrame([i for i in result])
    progress = db.query('select distinct(kw) from progress_list2')
    progress = pd.DataFrame([i for i in progress])
    if len(progress) != 0:
        keyword = result[~result['keyword'].isin(progress.kw.to_list())].iloc[0].values[0]
    else:
        keyword = result.iloc[0].values[0]
    return keyword


def get_not_crawler_url(keyword):
    # Return item URLs for the keyword that are in neither shop_list2
    # (already crawled) nor error_list2 (already failed).
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list3']
    url_list = list(table.find(keyword=keyword))
    shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list2 where keyword="{}"'.format(keyword))]
    error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list2 where keyword="{}"'.format(keyword))]
    url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
    # url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
    # url_pd = url_pd[(url_pd['item_url_length'] != 1000) & (url_pd['item_url_length'] != 600)]
    url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
    url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
    print('have {} URL list'.format(len(url_pd)))
    # url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
    return url_pd


def service_create_linux(profilepath):
    # Linux counterpart of service_create: headless Chrome with a saved
    # profile and a spoofed user agent.
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    option.add_argument('--no-sandbox')
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument('--incognito')
    option.add_argument(
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
    # option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("--user-data-dir=/home/noodlesloves/.config/google-chrome/")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('utility/chromedriver', options=option)
    # driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', chrome_options=option,
    #                           service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    return driver


def find_lon_lat(driver):
    # Right-click the centre of the visible map area and read the coordinates
    # from the first context-menu entry.
    e = driver.find_element_by_css_selector("#scene > div.widget-scene > canvas")
    size = e.size
    total_height = size['height']
    total_width = size['width']
    size2 = driver.find_element_by_css_selector("#pane > div.Yr7JMd-pane").size
    left_width = size2['width']
    print(total_height, total_width, left_width)
    x = (total_width - left_width) / 2 + left_width
    y = total_height / 2
    e = driver.find_element_by_css_selector("#pane > div.Yr7JMd-pane")
    action = webdriver.common.action_chains.ActionChains(driver)
    action.move_to_element_with_offset(e, x, y)
    action.context_click()
    action.perform()
    time.sleep(0.5)
    element = driver.find_element_by_css_selector('#action-menu > ul > li:nth-child(1)')
    lat, lon = element.text.split(',')
    return float(lat), float(lon)


def get_unique_id(driver):
    # Open the share dialog and take the trailing token of the short URL as a
    # unique place id; retry a few times until the input is populated.
    element = driver.find_element(By.CSS_SELECTOR, "button[data-value='分享']")  # "share"
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(0.5)
    for i in range(5):
        ele = driver.find_element(By.CSS_SELECTOR, "input")
        short_url = ele.get_attribute('value')
        unique_id = short_url.split('/')[-1]
        if len(unique_id) != 0:
            break
        time.sleep(0.5)
    element = driver.find_element(By.CSS_SELECTOR, "button[aria-label='關閉']")  # "close"
    driver.implicitly_wait(5)
    ActionChains(driver).move_to_element(element).click(element).perform()
    return unique_id


def page_down_(driver, xpath_css, time_):
    # Click the element found by `xpath_css` to give the pane keyboard focus,
    # then send PAGE_DOWN `time_` times so lazy content loads.
    elmts = driver.find_elements_by_xpath(xpath_css)
    # print(xpath_css)
    print(elmts)
    # time.sleep(9999)
    if len(elmts) > 1:
        elmt = elmts[1]
    else:
        elmt = elmts[0]
    actions = ActionChains(driver)
    actions.move_to_element(elmt).click().perform()
    for i in range(time_):
        try:
            actions = ActionChains(driver)
            actions.send_keys(Keys.PAGE_DOWN).perform()
        except Exception:
            traceback.print_exc()
        time.sleep(0.5)
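
# Typical calls elsewhere in this file:
#   page_down_(driver, '//div[@class="PPCwl"]', 5)        # scroll the reviews pane
#   page_down_(driver, '//*[@id="pane"]/div/div[1]', 3)   # scroll the intro pane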


def main():
    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    db2 = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table2 = db2['swire_store_list']
    # keyword = '麻辣火鍋'
    # if len(sys.argv) > 1:
    #     keyword = sys.argv[1]
    # port = 4444
    # if len(sys.argv) > 2:
    #     port = int(sys.argv[2])
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        print('restart docker p{}'.format(port))
        os.system('sudo docker container restart p' + str(port))
        time.sleep(8)
    else:
        port = 2
    # Process up to ten random batches of 500 stores not yet checked.
    for i in range(10):
        result = db2.query('select * from swire_store_list where check_ is null ORDER BY RAND() limit 500')
        url_pd = pd.DataFrame([dict(r) for r in result])
        url_pd['item_url'] = url_pd['fid'].apply(lambda x: 'https://www.google.com.tw/maps/@24.1753633,120.6747136,15z/data=!4m5!3m4!1s{}!8m2!3d24.1760271!4d120.6705323'.format(x))
        # keyword = get_new_keyword(db2)
        # table2.insert({'kw': keyword, 'num': 0})
        # url_pd = get_not_crawler_url(keyword)
        # print('driver start {}...'.format(keyword))
        driver = browser_start(port)
        time.sleep(4)
        # driver = service_create('Profile 6')
        # profilepath = 'Profile 1'
        # driver = service_create_linux(profilepath)
        for key, row in url_pd.iterrows():
            try:
                name = row['name']
                item_url = row['item_url']
                print(key, name, ': ', item_url)
                print('start...')
                driver.get(item_url)
                # page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
                page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu-haAclf']", 3)
                # lat, lon = find_lon_lat(driver)
                # unique_id = get_unique_id(driver)
                time_status = time_click(driver)
                time.sleep(0.5)
                shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
                output = {
                    # 'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text),
                    'name': name,
                    'fid': row['fid']
                }
                print(output['name'])
                print('get_shop_info')
                output = get_shop_info(driver, output, shop_soup)
                print('get_intro_info')
                if len(shop_soup.select("div[aria-label='{}簡介']".format(output['name']))) != 0:
                    output = get_intro_info(driver, output)
                else:
                    for key_ in intro_list:
                        output[intro_list[key_][0]] = '[]'
                print('get_time_list')
                if time_status == '正常':
                    output = get_time_list(shop_soup, output)
                else:
                    output['open_now'] = 'False'
                    output['periods'] = ''
                    output['weekday_text'] = ''
                print('user_ratings_total')
                if output['user_ratings_total'] == '':
                    output['reviews'] = ''
                else:
                    output = get_reviews(driver, output)
                print('find_big_photo')
                output = find_big_photo(output, driver)
                output_name = output['name'].replace('(', '').replace(')', '')
                query_name = '{}+{}'.format(output_name, output['addr'])
                query_name = query_name.replace(' ', '')
                output['item_url'] = item_url
                output['keyword'] = row['keyword']
                output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
                data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
                table2.upsert({'place_id': row['place_id'], 'check_': 1}, ['place_id'])
            except Exception as e:
                table3 = db2['error_list2']
                table3.insert({'num': row['name'], 'keyword': row['keyword'], 'item_url': row['item_url'],
                               'crawler_date': datetime.today().strftime("%Y/%m/%d %H:%M")})
                print(e)
                # error_table_col = ['name', 'keyword', 'item_url', 'crawler_date']
                # db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
                # data_select_insert(db, 'error_list2', error_table_col, row)
            time.sleep(1)


if __name__ == '__main__':
    main()
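
# Invocation sketch: `python run3.py 4444` restarts docker container p4444
# before crawling; with no argument, port defaults to 2. Note browser_start()
# currently launches a local headless Chrome and only uses `port` if the
# commented webdriver.Remote block inside it is re-enabled.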