# run.py

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from utility.parseutils import element_list, intro_list, week_list, value_check, blank_check
import pandas as pd
import time
import json
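
# Note: utility.parseutils is a project-local module that is not shown here.
# Based on how it is used below, it is assumed to provide:
#   element_list - dict mapping output keys to (tag, attrs[, attribute]) lookups
#   intro_list   - dict mapping an "About" section label to (output key, field name)
#   week_list    - dict mapping a weekday label to a day number
#   value_check(key, value) - normalizes a scraped value for the given key
#   blank_check(text)       - strips blanks/whitespace from scraped text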


def service_create(profilepath):
    # Launch a local Chrome with an existing user profile so Google Maps
    # is browsed with that profile's cookies and locale.
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('./utility/chromedriver_20211103/chromedriver', options=option)

    # Print the session details so the running browser can be re-attached later.
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver
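

# A minimal sketch of why session_id / executor_url are printed above: they
# let a new Remote driver re-attach to the already-running browser, a common
# Selenium 3 debugging trick. This helper is an assumption added for
# illustration; it is not called anywhere in the original script.
def attach_to_session(executor_url, session_id):
    original_execute = webdriver.Remote.execute

    def new_command_execute(self, command, params=None):
        if command == 'newSession':
            # Pretend a new session was created and reuse the existing one.
            return {'success': 0, 'sessionId': session_id, 'value': None}
        return original_execute(self, command, params)

    webdriver.Remote.execute = new_command_execute
    driver = webdriver.Remote(command_executor=executor_url, desired_capabilities={})
    webdriver.Remote.execute = original_execute
    return driver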


def browser_start():
    # Drive a remote Chrome through a Selenium server (Grid or standalone).
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://192.53.174.202:4444/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser
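
# The address above is assumed to point at a Selenium server (Grid or
# standalone), e.g. one started with:
#   docker run -d -p 4444:4444 --shm-size=2g selenium/standalone-chrome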


def keyin_keyword(driver, keyword):
    # Type the keyword into the Maps search box and submit it.
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)

    # Click through on the results pane (the class name is an obfuscated
    # Maps class and may change between Maps releases).
    element = driver.find_element_by_class_name("V0h1Ob-haAclf")
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()


def open_time(driver):
    # Open the opening-hours block in the place pane. Some shops put a
    # "預訂" (reserve) button in this slot instead; skip those and report 0.
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
    if element.text.find('預訂') == -1:
        element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
        driver.implicitly_wait(20)
        ActionChains(driver).move_to_element(element).click(element).perform()
        return 1
    else:
        return 0


def get_shop_info(driver, output, shop_soup):
    # The Maps URL looks like .../@<lat>,<lon>,<zoom>; pull the coordinates out of it.
    current_url_split = driver.current_url.split('@')[1].split(',')
    output['lon'] = current_url_split[1]
    output['lat'] = current_url_split[0]

    location = shop_soup.find('button', {'data-item-id': 'oloc'})['aria-label'].split(' ')
    output['city'] = location[-1]
    output['area'] = location[-2]
    print(location)

    output['addr'] = shop_soup.find('button', {'data-item-id': 'address'})['aria-label'].split(' ')[1]
    # The "複製電話號碼" (copy phone number) tooltip marks the phone button.
    output['tel'] = blank_check(shop_soup.find('button', {'data-tooltip': '複製電話號碼'})['aria-label'].split(':')[1])
    print(output['addr'], output['tel'])

    # element_list maps each output key to a (tag, attrs[, attribute]) lookup.
    for key in element_list:
        element = element_list[key]
        if len(element) == 3:
            value = shop_soup.find(element[0], element[1])[element[2]]
        else:
            tmp_value = shop_soup.find(element[0], element[1])
            if tmp_value:
                value = tmp_value.text
            else:
                value = ''
        output[key] = value_check(key, value)
    return output


def get_intro_info(driver, output):
    # Open the "About" tab of the place pane and scrape its feature lists.
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()
    intro_soup = BeautifulSoup(driver.page_source, 'html.parser')

    # intro_list maps a section's aria-label to (output key, item field name).
    for key in intro_list:
        elements = intro_soup.find('div', {'aria-label': key})
        if elements:
            element = elements.find_all('li', {'class': 'LQjNnc-p83tee-JNdkSc-ibnC6b'})
            count = 0
            tmp = []
            for ele in element:
                # Only keep items marked with the black check icon (i.e. offered).
                if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
                    tmp += [{
                        'id': count,
                        intro_list[key][1]: blank_check(ele.text)
                    }]
                    count += 1
            output[intro_list[key][0]] = str(tmp)
        else:
            output[intro_list[key][0]] = []
    driver.back()
    time.sleep(2)
    return output


def get_time_list(shop_soup, output):
    # "永久停業" = permanently closed, "暫時關閉" = temporarily closed.
    open_now = blank_check(shop_soup.find('span', {'class': 'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
    if open_now == '永久停業' or open_now == '暫時關閉':
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'

    # Build Google-Places-style opening periods from the hours table.
    periods = []
    weekday_text = []
    for tr_ in shop_soup.find_all('tr'):
        if tr_.find('div').text.replace(' ', '') != '':
            week = tr_.find('div').text
            time_list = [blank_check(i.text) for i in tr_.find_all('li')]
            for time_ in time_list:
                if time_ == '24 小時營業':  # open 24 hours
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": '0000'
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                elif time_ == '休息':  # closed that day
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": ''
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                else:
                    start, end = time_.split('–')
                    end_hour, end_min = end.split(':')
                    start_hour, start_min = start.split(':')
                    # An interval that ends "earlier" than it starts crosses midnight.
                    if int(end_hour) < int(start_hour):
                        end_day = week_list[week] + 1
                    else:
                        end_day = week_list[week]
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": start.replace(':', '')
                        },
                        "close": {
                            "day": end_day,
                            "time": end.replace(':', '')
                        }
                    }]
            weekday_text += ["{}: {}".format(week, ', '.join(time_list))]
    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    return output
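
# For reference, one entry produced by get_time_list looks like (values illustrative):
#   {"open": {"day": 1, "time": "1100"}, "close": {"day": 1, "time": "2200"}}
# mirroring the opening_hours.periods structure of the Google Places API.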


def get_reviews(driver, output):
    # Open the reviews panel.
    element = driver.find_element_by_css_selector("button[jsaction='pane.reviewChart.moreReviews']")
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(2)

    # Expand every photo thumbnail and every "顯示更多" (show more) link so
    # the full review text and images end up in the page source.
    all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
    for ap in all_photo:
        ap.click()
    all_review = driver.find_elements_by_css_selector('button[aria-label="顯示更多"]')
    for ap in all_review:
        ap.click()

    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    reviews = []
    for comment in comment_soup.find_all('div', {'class': 'ODSEW-ShBeI'}):
        comment_a_tag = comment.find_all('a')
        author_name = blank_check(comment_a_tag[1].find('div', class_='ODSEW-ShBeI-title').text)
        profile_photo_url = comment_a_tag[0].find('img')['src']
        # The rating label reads like "5 顆星" ("5 stars"); strip the suffix.
        rating = blank_check(comment.find('span', {'role': 'img'})['aria-label'].replace('顆星', ''))
        text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
        created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text

        # Review photos are inline CSS background images; pull the URL out of
        # the style attribute.
        photos = []
        for i in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c'):
            path = i['style'].split(';')[0].split('url')[1].replace('\"', '').replace('(', '').replace(')', '')
            photos += [path]

        reviews += [{
            'id': comment.find('a')['href'].split('/')[5],
            'author_name': author_name,
            'profile_photo_url': profile_photo_url,
            'rating': int(rating),
            'text': text,
            'created_at': created_at,
            'photos': photos
        }]
    output['reviews'] = reviews
    driver.back()
    return output


def main():
    data = pd.read_csv('lat_long_location.csv', index_col=0)
    tmp = data.iloc[10]
    latitude = tmp['latitude']    # latitude
    longitude = tmp['longitude']  # longitude
    url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)

    # driver = service_create('Profile 1')
    driver = browser_start()
    driver.get(url)
    keyin_keyword(driver, '燒烤')  # search keyword: 燒烤 (barbecue)

    result = []
    for id_ in range(1, 16):
        # Walk the first 15 entries in the results list.
        element = driver.find_element_by_xpath('//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[{}]'.format(id_))
        driver.implicitly_wait(20)
        ActionChains(driver).move_to_element(element).click(element).perform()

        time_check = open_time(driver)
        if time_check == 1:
            shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
            output = {
                'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
            }
            print(output['name'])

            output = get_shop_info(driver, output, shop_soup)
            output = get_intro_info(driver, output)
            time.sleep(2)
            output = get_time_list(shop_soup, output)
            output = get_reviews(driver, output)

            output_name = output['name'].replace('(', '').replace(')', '')  # currently unused
            output['google_url'] = 'https://www.google.com.tw/search?q={}+{}'.format(output['name'], output['addr'])
            time.sleep(2)

            result += [output]
            # Re-write the accumulated results after every shop.
            with open('result/20211203.json', 'w') as f:
                json.dump(result, f)
            time.sleep(2)


if __name__ == '__main__':
    main()
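
# Usage sketch (assumptions: a lat_long_location.csv with latitude/longitude
# columns, an existing result/ directory, and a reachable Selenium server):
#   python run.py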