run2.py
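"""Google Maps place crawler.

For each keyword, walks the place URLs stored in MySQL, scrapes each place
page (details, opening hours, reviews, photos) with Selenium + BeautifulSoup,
and writes the parsed rows back to the database.
"""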

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
import traceback
import dataset
import pandas as pd
import time
import json
import re
import sys
# import pyautogui as pag
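
# Start a local Chrome driver bound to an existing Windows user profile
# (presumably to reuse that profile's cookies and login state).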
def serive_create(profilepath):
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument("--user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('./utility/chromedriver_win32/chromedriver', options=option)
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver
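
# Attach to a remote Selenium server (Grid / standalone chromedriver)
# listening on the given local port.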
def brower_start(port):
    options = webdriver.ChromeOptions()
    # browser = webdriver.Chrome(options=options)
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        # command_executor='http://192.53.174.202:'+str(port)+'/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser
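
# Type the search keyword into the Maps search box and submit it.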
def keyin_keyword(driver, keyword):
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
    # element = driver.find_element_by_class_name("V0h1Ob-haAclf")
    # driver.implicitly_wait(30)
    # ActionChains(driver).move_to_element(element).click(element).perform()
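
# Click the opening-hours block unless its text contains '預訂' (booking);
# returns 1 when clicked, 0 otherwise.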
def open_time(driver):
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
    if element.text.find('預訂') == -1:
        element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
        driver.implicitly_wait(10)
        ActionChains(driver).move_to_element(element).click(element).perform()
        return 1
    else:
        return 0
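
# Parse basic place fields: lat/lon from the current URL, city/area/address/
# phone from the page, plus every selector listed in element_list
# (imported from utility.parseutils).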
def get_shop_info(driver, output, shop_soup):
    current_url_split = driver.current_url.split('@')[1].split(',')
    output['lon'] = current_url_split[1]
    output['lat'] = current_url_split[0]
    location = shop_soup.find('button', {'data-item-id': 'oloc'})['aria-label'].split(' ')
    output['city'] = location[-1]
    output['area'] = location[-2]
    try:
        output['addr'] = shop_soup.find('button', {'data-item-id': 'address'})['aria-label'].replace('地址:', '')
    except:
        output['addr'] = ''
    try:
        output['tel'] = blank_check(shop_soup.find('button', {'data-tooltip': '複製電話號碼'})['aria-label'].split(':')[1])
    except:
        output['tel'] = ''
    print(output['addr'], ', ', output['tel'])
    for key in element_list:
        try:
            element = element_list[key]
            if len(element) == 3:
                value = shop_soup.find(element[0], element[1])[element[2]]
            else:
                tmp_value = shop_soup.find(element[0], element[1])
                if tmp_value:
                    value = tmp_value.text
                else:
                    value = ''
            output[key] = value_check(key, value)
        except:
            output[key] = ''
    return output
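
# Open the '簡介' (About) pane and collect the checked attribute items
# defined in intro_list; each output value is stored as a stringified list.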
def get_intro_info(driver, output):
    # element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
    try:
        element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}簡介']".format(output['name']))
        driver.implicitly_wait(5)
        ActionChains(driver).move_to_element(element).click(element).perform()
        # pageSource = driver.page_source
        # fileToWrite = open("page_source.html", "w")
        # fileToWrite.write(pageSource)
        # fileToWrite.close()
        page_down_(driver, '//*[@id="pane"]/div/div[1]', 3)
        intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
        for key in intro_list:
            elements = intro_soup.find('div', {'aria-label': key})
            if elements:
                element = elements.find_all('li', {'class': 'LQjNnc-p83tee-JNdkSc-ibnC6b'})
                count = 0
                tmp = []
                for ele in element:
                    # if ele.find('img',{'src':"//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
                    if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"}):
                        tmp += [{
                            'id': count,
                            intro_list[key][1]: blank_check(ele.text)
                        }]
                        count += 1
                print(str(tmp))
                output[intro_list[key][0]] = str(tmp)
            else:
                output[intro_list[key][0]] = '[]'
        driver.back()
        return output
    except:
        for key in intro_list:
            output[intro_list[key][0]] = '[]'
        return output
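
# Parse the weekly opening-hours table into Google-Places-style 'periods'
# plus human-readable 'weekday_text', and flag whether the shop is open.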
def get_time_list(shop_soup, output):
    periods = []
    weekday_text = []
    open_now = blank_check(shop_soup.find('span', {'class': 'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
    if open_now == '永久停業' or open_now == '暫時關閉':
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'
    for tr_ in shop_soup.find_all('tr'):
        if tr_.find('div').text.replace(' ', '') != '':
            week = tr_.find('div').text
            time_list = [blank_check(i.text) for i in tr_.find_all('li')]
            for time_ in time_list:
                if time_ == '24 小時營業':
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": '0000'  # midnight; the original had a bare int literal 0000 here
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                elif time_ == '休息':
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": ''
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                else:
                    start, end = time_.split('–')
                    end_hour, end_min = end.split(':')
                    start_hour, start_min = start.split(':')
                    # string comparison: assumes zero-padded HH:MM values
                    if end_hour < start_hour:
                        end_day = week_list[week] + 1
                    else:
                        end_day = week_list[week]
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": start.replace(':', '')
                        },
                        "close": {
                            "day": end_day,
                            "time": end.replace(':', '')
                        }
                    }]
            weekday_text += ["{}: {}".format(week, ', '.join(time_list))]
    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    return output
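
# Open the reviews pane, expand photos and truncated review texts, then
# scrape author, rating, text, date, and photo URLs for each review.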
def get_reviews(driver, output):
    wait = WebDriverWait(driver, 30)
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
    )
    element = driver.find_element_by_css_selector(more_reviews_css)
    driver.implicitly_wait(10)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(0.5)
    # page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]', 5)
    page_down_(driver, '//div[@class="PPCwl"]', 5)
    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    # find_all()/select() return lists, so compare their length
    # (the original compared the list itself to 0, which is always True)
    if len(comment_soup.find_all('div', class_='ODSEW-ShBeI-xJzy8c-bF1uUb')) != 0:
        all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
        for ap in all_photo:
            ap.click()
    if len(comment_soup.select('button[aria-label="顯示更多"]')) != 0:
        all_review = driver.find_elements_by_css_selector('button[aria-label="顯示更多"]')
        for ap in all_review:
            ap.click()
    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    count = 0
    reviews = []
    for comment in comment_soup.find_all('div', {'class': 'ODSEW-ShBeI'}):
        comment_a_tag = comment.find_all('a')
        author_name = blank_check(comment_a_tag[1].find('div', class_='ODSEW-ShBeI-title').text)
        profile_photo_url = comment_a_tag[0].find('img')['src']
        rating = blank_check(comment.find('span', {'role': 'img'})['aria-label'].replace('顆星', ''))
        text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
        created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text
        photos = []
        c = 0
        for i in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c'):
            path = i['style'].split(';')[0].split('url')[1].replace('\"', '').replace('(', '').replace(')', '')
            photos += [path]
            c += 1
        reviews += [{
            'id': comment.find('a')['href'].split('/')[5],
            'author_name': author_name,
            'profile_photo_url': profile_photo_url,
            'rating': int(rating),
            'text': text,
            'created_at': created_at,
            'photos': photos
        }]
        count += 1
    output['reviews'] = str(reviews)
    driver.back()
    return output

# def get_photo(output, shop_soup):
#     shop_photo = {}
#     for i in shop_soup.find('div', {'aria-label': '{}的相片'.format(output['name'])}).find_all('button'):
#         try:
#             if i['aria-label'] == '街景服務和 360 度相片' or i['aria-label'] == '影片':
#                 continue
#             shop_photo[i['aria-label']] = i.find('img')['src']
#         except:
#             pass
#     output['shop_photo'] = shop_photo
#     return output
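
# Scroll the photo grid and collect up to six image URLs from the inline
# style attributes of the loaded thumbnails.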
def find_photo_list(driver):
    time.sleep(0.5)
    wait = WebDriverWait(driver, 60)
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
    )
    page_down_(driver, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a', 10)
    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
    photo_url = []
    count = 0
    for i in photo_soup.find_all('a', class_='mWq4Rd-eEDwDf'):
        if count > 5:
            break
        a_url = i.find('div', class_='mWq4Rd-HiaYvf-CNusmb-gevUs loaded')
        if a_url:
            # check the style attribute string for a width declaration;
            # the original called Tag.find('width'), which looks for a
            # <width> element and never returns -1
            if a_url['style'].find('width') != -1:
                sentence = a_url['style']
                photo = re.search(r'https:(.*)\"', sentence)
                photo_url += [photo.group(0).replace('\"', '')]
                count += 1
    return photo_url
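
# Open the photo viewer, detect the '全部' (all) and '菜單' (menu) tabs,
# and store a photo-URL list for each tab found.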
def find_big_photo(output, driver):
    # element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
    wait = WebDriverWait(driver, 60)
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button'))
    )
    element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[1]/div[1]/button')
    ActionChains(driver).move_to_element(element).click(element).perform()
    output['shop_photo'] = '[]'
    output['menu_photo'] = '[]'
    photo_map = {
        '全部': 'shop_photo',
        '菜單': 'menu_photo'
    }
    driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='1']")
    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
    tab_dict = {}
    for tab_index in [0, 1, 2]:
        selector = photo_soup.select("button[data-tab-index='{}']".format(tab_index))
        if len(selector) != 0:
            photo_name = selector[0].text
            if photo_name == '菜單':
                tab_dict[photo_name] = tab_index
            elif photo_name == '全部':
                tab_dict[photo_name] = tab_index
    print(tab_dict)
    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        photo_list = find_photo_list(driver)
        output[photo_map[tab_]] = str(photo_list)
    return output
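
# Scroll through the search-result pane and return [href, aria-label]
# pairs for every link pointing at a maps/place URL.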
def get_url_list(driver):
    # wait = WebDriverWait(driver, 10)
    # wait.until(
    #     EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
    # )
    # driver.back()
    time.sleep(2)
    for i in range(5, 43, 2):
        driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)
    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except:
            pass
    return url_list
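
# Build and execute an INSERT IGNORE statement for one row, appending the
# crawl timestamp as crawler_date.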
def data_select_insert(db, table_name, table_col, data):
    tmp = []
    for name_ in table_col:
        if name_ == 'crawler_date':
            continue
        if name_ == 'lon' or name_ == 'lat':
            tmp += [float(data[name_])]
        else:
            tmp += [data[name_]]
    tmp += [datetime.today().strftime("%Y/%m/%d %H:%M")]
    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
        .format(table_name, str(tuple(table_col)).replace('\'', ''), tuple(tmp))
    DA.mysql_insert_data(db, insert_sql)
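
# Classify the opening-hours widget: click it and return '正常' when the
# weekly table is available, return the closure-notice text when present,
# or 'error' when only the detailed-hours button exists.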
def time_click(driver):
    shop_soup_tmp = BeautifulSoup(driver.page_source, 'html.parser')
    status = ''
    try:
        if len(shop_soup_tmp.select("span[aria-label='顯示本週營業時間']")) != 0:
            time_css = "span[aria-label='顯示本週營業時間']"
            element = driver.find_element_by_css_selector(time_css)
            driver.implicitly_wait(10)
            ActionChains(driver).move_to_element(element).click(element).perform()
            status = '正常'
        elif len(shop_soup_tmp.select("img[aria-label='通知']")) != 0:
            status = shop_soup_tmp.find('span', class_='LJKBpe-Tswv1b-text aSftqf').text
            # status = '永久停業' or '暫時關閉'
        elif len(shop_soup_tmp.select('button[aria-label*="查看更詳細的營業時間"]')) != 0:
            status = 'error'
        return status
    except:
        return ''
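
# Fetch the keyword's candidate item URLs from shop_item_list and drop the
# ones already crawled (shop_list) or already failed (error_list).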
def get_not_cralwer_url(keyword):
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list']
    url_list = list(table.find(keyword=keyword))
    shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list where keyword="{}"'.format(keyword))]
    error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list where keyword="{}"'.format(keyword))]
    url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
    url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
    url_pd = url_pd[(url_pd['item_url_length'] != 1000) & (url_pd['item_url_length'] != 600)]
    url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
    url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
    print('have {} URL list'.format(len(url_pd)))
    # url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
    return url_pd
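
# Headless Chrome variant of serive_create for Linux hosts, reusing a
# local google-chrome profile directory.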
def serive_create_linux(profilepath):
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    option.add_argument('--no-sandbox')
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument('--incognito')
    option.add_argument(
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
    # option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument(
        "--user-data-dir=/home/noodlesloves/.config/google-chrome/")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('utility/chromedriver', options=option)
    # driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', chrome_options=option,
    #                           service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    return driver
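
# Click the element at xpath_css to focus the pane, then send PAGE_DOWN
# time_ times to force lazy-loaded content to render.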
def page_down_(driver, xpath_css, time_):
    elmts = driver.find_elements_by_xpath(xpath_css)
    print(elmts)
    if len(elmts) > 1:
        elmt = elmts[1]
    else:
        elmt = elmts[0]
    actions = ActionChains(driver)
    actions.move_to_element(elmt).click().perform()
    for i in range(time_):
        try:
            actions = ActionChains(driver)
            actions.send_keys(Keys.PAGE_DOWN).perform()
        except:
            traceback.print_exc()
        time.sleep(0.5)
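
# Crawl each keyword's pending URLs: open the place page, scrape details,
# hours, reviews, and photos, and insert the result (or an error row) into
# MySQL. Optional argv values are keyword and port.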
def main():
    # db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    keyword = '麻辣火鍋'
    if len(sys.argv) > 1:
        keyword = sys.argv[1]
    port = 4448
    if len(sys.argv) > 2:
        port = int(sys.argv[2])
    # NOTE: this loop overrides any keyword passed on the command line
    for keyword in ['鳳梨酥', '蔥油餅', '滷肉飯']:
        db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
        url_pd = get_not_cralwer_url(keyword)
        print('driver start {}...'.format(keyword))
        driver = brower_start(port)
        # driver = serive_create('Profile 6')
        # profilepath = 'Profile 1'
        # driver = serive_create_linux(profilepath)
        for key, row in url_pd.iterrows():
            try:
                name = row['name']
                item_url = row['item_url']
                print(key, name, ': ', item_url)
                print('start...')
                driver.get(item_url)
                page_down_(driver, "//div[@class='x3AX1-LfntMc-header-title-ij8cu']", 3)
                time_status = time_click(driver)
                if time_status == 'error':
                    error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
                    data_select_insert(db, 'error_list', error_table_col, row)
                    continue
                time.sleep(0.5)
                shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
                output = {
                    'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
                }
                print(output['name'])
                print('get_shop_info')
                output = get_shop_info(driver, output, shop_soup)
                print('get_intro_info')
                if len(shop_soup.select("div[aria-label='{}簡介']".format(output['name']))) != 0:
                    output = get_intro_info(driver, output)
                else:
                    for key in intro_list:
                        output[intro_list[key][0]] = '[]'
                print('get_time_list')
                if time_status == '正常':
                    output = get_time_list(shop_soup, output)
                else:
                    output['open_now'] = False
                    output['periods'] = ''
                    output['weekday_text'] = ''
                print('user_ratings_total')
                if output['user_ratings_total'] == '':
                    output['reviews'] = ''
                else:
                    output = get_reviews(driver, output)
                print('find_big_photo')
                output = find_big_photo(output, driver)
                output_name = output['name'].replace('(', '').replace(')', '')
                query_name = '{}+{}'.format(output_name, output['addr'])
                query_name = query_name.replace(' ', '')
                output['item_url'] = item_url
                output['keyword'] = keyword
                output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
                data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
            except Exception as e:
                print(e)
                error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
                data_select_insert(db, 'error_list', error_table_col, row)
                time.sleep(1)
                # driver.close()
                # driver = brower_start(port)
                # driver = serive_create_linux(profilepath)

if __name__ == '__main__':
    main()