run.py

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *   # blank_check, value_check, element_list, intro_list, week_list, ...
from utility.connect import *      # MYSQL_CONFIG, DB_NAME, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, ...
from datetime import datetime
import dataset
import pandas as pd
import time
import re
# import pyautogui as pag
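
# service_create: start a local Chrome session that reuses an existing Windows user
# profile, so the crawler inherits that profile's cookies and language settings.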
def service_create(profilepath):
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('./utility/chromedriver_20211103/chromedriver', options=option)
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver
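
# browser_start: attach to a remote Selenium Grid hub rather than launching a local
# browser; this is the session main() actually uses.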
def browser_start():
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://192.53.174.202:4444/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser
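
# keyin_keyword: type the search keyword into the Google Maps search box and press
# Enter to run the search.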
def keyin_keyword(driver, keyword):
    button = driver.find_element(By.ID, "searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
    # element = driver.find_element(By.CLASS_NAME, "V0h1Ob-haAclf")
    # driver.implicitly_wait(30)
    # ActionChains(driver).move_to_element(element).click(element).perform()
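
# open_time: click into the hours element unless the pane only shows a '預訂'
# ("Reserve") label; returns 1 when the element was clicked, 0 otherwise.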
def open_time(driver):
    element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
    # '預訂' = "Reserve"; skip the pane when it only offers a booking button.
    if element.text.find('預訂') == -1:
        driver.implicitly_wait(20)
        ActionChains(driver).move_to_element(element).click(element).perform()
        return 1
    else:
        return 0
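
# get_shop_info: read lat/lon from the place URL, then scrape city, area, address,
# phone, and every field described by element_list out of the parsed page.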
def get_shop_info(driver, output, shop_soup):
    # The current URL looks like ".../@<lat>,<lon>,<zoom>z/...".
    current_url_split = driver.current_url.split('@')[1].split(',')
    output['lon'] = current_url_split[1]
    output['lat'] = current_url_split[0]

    location = shop_soup.find('button', {'data-item-id': 'oloc'})['aria-label'].split(' ')
    output['city'] = location[-1]
    output['area'] = location[-2]
    # '地址:' = "Address:", '複製電話號碼' = "Copy phone number"
    output['addr'] = shop_soup.find('button', {'data-item-id': 'address'})['aria-label'].replace('地址:', '')
    output['tel'] = blank_check(shop_soup.find('button', {'data-tooltip': '複製電話號碼'})['aria-label'].split(':')[1])
    print(output['addr'], ',', output['tel'])

    # element_list (from utility.parseutils) maps output keys to (tag, attrs[, attribute]) lookups.
    for key in element_list:
        element = element_list[key]
        if len(element) == 3:
            value = shop_soup.find(element[0], element[1])[element[2]]
        else:
            tmp_value = shop_soup.find(element[0], element[1])
            if tmp_value:
                value = tmp_value.text
            else:
                value = ''
        output[key] = value_check(key, value)
    return output
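
# get_intro_info: open the place's "About" pane, scroll it fully into view, and collect
# each attribute marked with a check icon into one list per intro_list section.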
def get_intro_info(driver, output):
    element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[6]')
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()

    # Scroll through the intro pane until no more rows are found.
    for i in range(5, 35, 3):
        try:
            element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[{}]'.format(i))
            actions = ActionChains(driver)
            actions.move_to_element(element).perform()
        except NoSuchElementException:
            break

    intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
    # intro_list (from utility.parseutils) maps section labels to (output key, item key) pairs.
    for key in intro_list:
        elements = intro_soup.find('div', {'aria-label': key})
        if elements:
            element = elements.find_all('li', {'class': 'LQjNnc-p83tee-JNdkSc-ibnC6b'})
            count = 0
            tmp = []
            for ele in element:
                # if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
                if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/1x/check_black_18dp.png"}):
                    tmp += [{
                        'id': count,
                        intro_list[key][1]: blank_check(ele.text)
                    }]
                    count += 1
            print(str(tmp))
            output[intro_list[key][0]] = str(tmp)
        else:
            output[intro_list[key][0]] = '[]'
    driver.back()
    return output
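
# get_time_list: convert the opening-hours table into Google-Places-style "periods"
# (day index plus HHMM strings) and human-readable "weekday_text" lines.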
def get_time_list(shop_soup, output):
    periods = []
    weekday_text = []

    # '永久停業' = "Permanently closed", '暫時關閉' = "Temporarily closed"
    open_now = blank_check(shop_soup.find('span', {'class': 'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
    if open_now == '永久停業' or open_now == '暫時關閉':
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'

    for tr_ in shop_soup.find_all('tr'):
        if tr_.find('div').text.replace(' ', '') != '':
            week = tr_.find('div').text
            time_list = [blank_check(i.text) for i in tr_.find_all('li')]
            for time_ in time_list:
                if time_ == '24 小時營業':  # "Open 24 hours"
                    periods += [{
                        "open": {"day": week_list[week], "time": '0000'},
                        "close": {"day": week_list[week], "time": ''}
                    }]
                elif time_ == '休息':  # "Closed"
                    periods += [{
                        "open": {"day": week_list[week], "time": ''},
                        "close": {"day": week_list[week], "time": ''}
                    }]
                else:
                    start, end = time_.split('–')
                    end_hour = end.split(':')[0]
                    start_hour = start.split(':')[0]
                    # Closing past midnight rolls over to the next day (wrap Saturday -> Sunday).
                    if int(end_hour) < int(start_hour):
                        end_day = (week_list[week] + 1) % 7
                    else:
                        end_day = week_list[week]
                    periods += [{
                        "open": {"day": week_list[week], "time": start.replace(':', '')},
                        "close": {"day": end_day, "time": end.replace(':', '')}
                    }]
            weekday_text += ["{}: {}".format(week, ', '.join(time_list))]
    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    return output
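
# get_reviews: open the reviews pane, expand all photos and truncated texts, then
# scrape author, rating, text, date, and photo URLs for every visible review.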
def get_reviews(driver, output):
    wait = WebDriverWait(driver, 30)
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
    )
    element = driver.find_element(By.CSS_SELECTOR, more_reviews_css)
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(1)

    # Expand every review photo and every truncated review ('顯示更多' = "Show more").
    all_photo = driver.find_elements(By.CLASS_NAME, 'ODSEW-ShBeI-xJzy8c-bF1uUb')
    for ap in all_photo:
        ap.click()
    all_review = driver.find_elements(By.CSS_SELECTOR, 'button[aria-label="顯示更多"]')
    for ap in all_review:
        ap.click()

    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    reviews = []
    for comment in comment_soup.find_all('div', {'class': 'ODSEW-ShBeI'}):
        comment_a_tag = comment.find_all('a')
        author_name = blank_check(comment_a_tag[1].find('div', class_='ODSEW-ShBeI-title').text)
        profile_photo_url = comment_a_tag[0].find('img')['src']
        # '顆星' = "stars"; the aria-label reads like "5 顆星".
        rating = blank_check(comment.find('span', {'role': 'img'})['aria-label'].replace('顆星', ''))
        text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
        created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text
        photos = []
        for i in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c'):
            path = i['style'].split(';')[0].split('url')[1].replace('"', '').replace('(', '').replace(')', '')
            photos += [path]
        reviews += [{
            'id': comment.find('a')['href'].split('/')[5],
            'author_name': author_name,
            'profile_photo_url': profile_photo_url,
            'rating': int(rating),
            'text': text,
            'created_at': created_at,
            'photos': photos
        }]
    output['reviews'] = str(reviews)
    driver.back()
    return output
# def get_photo(output, shop_soup):
#     shop_photo = {}
#     # '{}的相片' = "photos of {}"; skip '街景服務和 360 度相片' (Street View and
#     # 360-degree photos) and '影片' (videos).
#     for i in shop_soup.find('div', {'aria-label': '{}的相片'.format(output['name'])}).find_all('button'):
#         try:
#             if i['aria-label'] == '街景服務和 360 度相片' or i['aria-label'] == '影片':
#                 continue
#             shop_photo[i['aria-label']] = i.find('img')['src']
#         except:
#             pass
#     output['shop_photo'] = shop_photo
#     return output
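
# find_photo_list: walk up to five gallery thumbnails and extract each image URL from
# the thumbnail's inline background-image style.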
def find_photo_list(driver):
    time.sleep(2)
    wait = WebDriverWait(driver, 60)
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
    )
    count_list = []
    for i in range(1, 6):
        try:
            element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[{}]/div/a'.format(i))
            count_list += [element.get_attribute('data-photo-index')]
            actions = ActionChains(driver)
            actions.move_to_element(element).perform()
        except NoSuchElementException:
            break
    time.sleep(1)

    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
    photo_url = []
    for photo_id in count_list:
        for i in photo_soup.select('a[data-photo-index="{}"]'.format(photo_id))[0].find_all('div'):
            # The image URL is embedded in the inline style, e.g. background-image: url("https://...").
            if i['style'].find('width') != -1:
                sentence = i['style']
                photo = re.search(r'https:(.*)"', sentence)
                print(sentence)
                photo_url += [photo.group(0).replace('"', '')]
                break
    return photo_url
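
# find_big_photo: open the photo viewer and collect image URLs from the '全部' ("All")
# and '菜單' ("Menu") tabs into shop_photo and menu_photo.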
def find_big_photo(output, driver):
    # "{}的相片" = "photos of {}"
    element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
    ActionChains(driver).move_to_element(element).click(element).perform()
    output['shop_photo'] = '[]'
    output['menu_photo'] = '[]'
    photo_map = {
        '全部': 'shop_photo',   # "All"
        '菜單': 'menu_photo'    # "Menu"
    }

    tab_dict = {}
    for tab_index in [0, 1, 2]:
        photo_name = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)).text
        if photo_name == '菜單' or photo_name == '全部':
            tab_dict[photo_name] = tab_index
    print(tab_dict)

    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        photo_list = find_photo_list(driver)
        output[photo_map[tab_]] = str(photo_list)
    return output
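
# get_url_list: scroll the search-result list so it lazy-loads, then collect every
# "maps/place" link together with its aria-label (the place name).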
def get_url_list(driver):
    # wait = WebDriverWait(driver, 10)
    # wait.until(
    #     EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
    # )
    # driver.back()
    time.sleep(2)
    for i in range(5, 43, 2):
        driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except KeyError:
            pass
    return url_list
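
# data_select_insert: build an INSERT IGNORE statement from table_col and write one
# row; values are string-formatted rather than parameterized, so inputs must be trusted.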
def data_select_insert(db, table_name, table_col, data):
    tmp = []
    for name_ in table_col:
        if name_ == 'crawler_date':
            continue
        if name_ == 'lon' or name_ == 'lat':
            tmp += [float(data[name_])]
        else:
            tmp += [data[name_]]
    # crawler_date is appended last, so it must be the final column in table_col.
    tmp += [datetime.today().strftime("%Y/%m/%d %H:%M")]

    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
        .format(table_name, str(tuple(table_col)).replace("'", ''), tuple(tmp))
    DA.mysql_insert_data(db, insert_sql)
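
# time_click: expand the weekly opening-hours table; returns '正常' ("normal") when it
# opens, or '暫時關閉' ("temporarily closed") when the hours panel stays collapsed.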
def time_click(driver):
    status = ''
    try:
        # '顯示本週營業時間' = "Show this week's opening hours"
        time_css = "span[aria-label='顯示本週營業時間']"
        element = driver.find_element(By.CSS_SELECTOR, time_css)
        driver.implicitly_wait(30)
        ActionChains(driver).move_to_element(element).click(element).perform()
        status = '正常'  # "normal"
    except NoSuchElementException:
        time_css = "div[aria-expanded='false']"
        elem = driver.find_element(By.CSS_SELECTOR, time_css)
        if elem:
            status = '暫時關閉'  # "temporarily closed"
    return status
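
# get_not_crawler_url: load the keyword's candidate item URLs from shop_item_list and
# drop those already recorded in shop_list (done) or error_list (failed).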
def get_not_crawler_url(keyword):
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/google_poi?charset=utf8mb4')
    table = db['shop_item_list']
    url_list = list(table.find(keyword=keyword))
    shop_item = [i['item_url'] for i in db.query('SELECT item_url FROM shop_list WHERE keyword="{}"'.format(keyword))]
    error_item = [i['item_url'] for i in db.query('SELECT item_url FROM error_list WHERE keyword="{}"'.format(keyword))]

    url_pd = pd.DataFrame(url_list, columns=url_list[0].keys())
    url_pd['item_url_length'] = url_pd.item_url.apply(lambda x: len(x))
    url_pd = url_pd[url_pd['item_url_length'] != 1000]
    # Skip URLs that were already crawled successfully or already failed.
    url_pd = url_pd[~url_pd['item_url'].isin(shop_item)]
    url_pd = url_pd[~url_pd['item_url'].isin(error_item)]
    print('have {} URL list'.format(len(url_pd)))
    # url_list = pd.read_csv('result/shop_item_list_20211210.csv', index_col=0)
    return url_pd
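
# service_create_linux: headless variant of service_create for a Linux host, with a
# fixed user agent and a Linux Chrome profile directory.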
def service_create_linux(profilepath):
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    option.add_argument('--no-sandbox')
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument('--incognito')
    option.add_argument(
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
    # option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument(
        "--user-data-dir=/home/noodlesloves/.config/google-chrome/")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('utility/chromedriver', options=option)
    # driver = webdriver.Chrome(executable_path='/usr/bin/chromedriver', chrome_options=option,
    #                           service_args=['--verbose', '--log-path=/tmp/chromedriver.log'])
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    return driver
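
# main: crawl every pending place page for the keyword, assemble one output row per
# place, and insert it into the shop list table; on any failure the row is logged to
# error_list and the browser session is recycled.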
def main():
    keyword = '咖啡'  # "coffee"
    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    url_pd = get_not_crawler_url(keyword)

    print('driver start...')
    driver = browser_start()
    # driver = service_create('Profile 1')
    # profilepath = 'Profile 1'
    # driver = service_create_linux(profilepath)

    for key, row in url_pd.iterrows():
        try:
            name = row['name']
            item_url = row['item_url']
            print(key, name, ':', item_url)

            driver.get(item_url)
            # Scroll the place pane so its lazy-loaded sections render.
            for i in range(4, 26, 2):
                element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[{}]'.format(i))
                actions = ActionChains(driver)
                actions.move_to_element(element).perform()
                time.sleep(0.5)

            print('start...')
            time_status = time_click(driver)
            time.sleep(0.5)
            shop_soup = BeautifulSoup(driver.page_source, 'html.parser')

            output = {
                'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
            }
            print(output['name'])

            output = get_shop_info(driver, output, shop_soup)
            output = get_intro_info(driver, output)
            output = get_time_list(shop_soup, output)
            output = get_reviews(driver, output)
            output = find_big_photo(output, driver)

            output_name = output['name'].replace('(', '').replace(')', '')
            query_name = '{}+{}'.format(output_name, output['addr'])
            query_name = query_name.replace(' ', '')
            output['item_url'] = item_url
            output['keyword'] = keyword
            output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)

            data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
        except Exception:
            # Log the failed row to error_list and recycle the browser session.
            error_table_col = ['name', 'lon', 'lat', 'keyword', 'item_url', 'crawler_date']
            data_select_insert(db, 'error_list', error_table_col, row)
            driver.close()
            driver = browser_start()
            # driver = service_create_linux(profilepath)


if __name__ == '__main__':
    main()