run.py

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
import pandas as pd
import time
import json
import re
# import pyautogui as pag

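# This script drives Chrome through Selenium against the zh-TW Google Maps
# UI: it searches around a coordinate from lat_long_location.csv, opens each
# result, scrapes shop details, opening hours, reviews and photos, then
# writes every record to a JSON file and a MySQL table. The Chinese string
# literals below match zh-TW interface text and must stay untranslated, and
# the obfuscated class names (e.g. 'ODSEW-ShBeI') are tied to the Maps build
# this was written against, so they break whenever Google ships new ones.
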
def service_create(profilepath):
    # Launch a local Chrome with an existing user profile so the crawler
    # reuses that profile's cookies and settings.
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('./utility/chromedriver_20211103/chromedriver', options=option)
    # Print the session handle so the running browser can be reattached later.
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver

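# The session handle printed above allows reattaching to the same running
# browser from another process. A minimal sketch (an assumption, not used
# anywhere in this script):
#
#   driver = webdriver.Remote(command_executor=executor_url,
#                             desired_capabilities={})
#   driver.close()                    # drop the extra session Remote opened
#   driver.session_id = session_id    # take over the original session
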
def browser_start():
    # Attach to a remote Selenium node instead of launching a local Chrome.
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://192.53.174.202:4444/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser

def keyin_keyword(driver, keyword):
    # Type the keyword into the Maps search box and submit it.
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
    # element = driver.find_element_by_class_name("V0h1Ob-haAclf")
    # driver.implicitly_wait(30)
    # ActionChains(driver).move_to_element(element).click(element).perform()

def open_time(driver):
    # Click the opening-hours block only when it is not a booking ('預訂')
    # widget; returns 1 when clicked, 0 otherwise.
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
    if element.text.find('預訂') == -1:
        driver.implicitly_wait(20)
        ActionChains(driver).move_to_element(element).click(element).perform()
        return 1
    else:
        return 0

def get_shop_info(driver, output, shop_soup):
    # Coordinates come from the current URL, which has the form .../@lat,lon,zoom.
    current_url_split = driver.current_url.split('@')[1].split(',')
    output['lon'] = current_url_split[1]
    output['lat'] = current_url_split[0]

    location = shop_soup.find('button', {'data-item-id': 'oloc'})['aria-label'].split(' ')
    output['city'] = location[-1]
    output['area'] = location[-2]

    # '地址:' is the zh-TW "Address:" prefix; '複製電話號碼' labels the
    # copy-phone-number button.
    output['addr'] = shop_soup.find('button', {'data-item-id': 'address'})['aria-label'].replace('地址:', '')
    output['tel'] = blank_check(shop_soup.find('button', {'data-tooltip': '複製電話號碼'})['aria-label'].split(':')[1])
    print(output['addr'], ', ', output['tel'])

    # element_list (from utility.parseutils) maps an output key to a
    # (tag, attrs[, attribute]) lookup rule.
    for key in element_list:
        element = element_list[key]
        if len(element) == 3:
            value = shop_soup.find(element[0], element[1])[element[2]]
        else:
            tmp_value = shop_soup.find(element[0], element[1])
            if tmp_value:
                value = tmp_value.text
            else:
                value = ''
        output[key] = value_check(key, value)
    return output

def get_intro_info(driver, output):
    # Open the "about" panel, scroll through its blocks so everything is
    # rendered, then parse the checklist items out of the page source.
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()

    for i in range(5, 35, 3):
        try:
            element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[{}]'.format(i))
            actions = ActionChains(driver)
            actions.move_to_element(element).perform()
        except Exception:
            # No more blocks to scroll to.
            break

    intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
    # intro_list (from utility.parseutils) maps a zh-TW section label to
    # (output_key, item_key).
    for key in intro_list:
        elements = intro_soup.find('div', {'aria-label': key})
        if elements:
            element = elements.find_all('li', {'class': 'LQjNnc-p83tee-JNdkSc-ibnC6b'})
            # print(element)
            count = 0
            tmp = []
            for ele in element:
                # Only items marked with the black check icon are offered.
                if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
                    tmp += [{
                        'id': count,
                        intro_list[key][1]: blank_check(ele.text)
                    }]
                    count += 1
            print(str(tmp))
            output[intro_list[key][0]] = str(tmp)
        else:
            output[intro_list[key][0]] = '[]'
    driver.back()
    return output

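# Each intro section lands in output as the *string* form of a list of
# dicts. Shape sketch (the actual key names come from intro_list in
# utility.parseutils, so they are placeholders here):
#
#   output[<section_key>] == "[{'id': 0, <item_key>: '...'}, ...]"
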
def get_time_list(shop_soup, output):
    # '永久停業' = permanently closed, '暫時關閉' = temporarily closed.
    open_now = blank_check(shop_soup.find('span', {'class': 'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
    if open_now == '永久停業' or open_now == '暫時關閉':
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'

    periods = []
    weekday_text = []
    for tr_ in shop_soup.find_all('tr'):
        if tr_.find('div').text.replace(' ', '') != '':
            week = tr_.find('div').text
            time_list = [blank_check(i.text) for i in tr_.find_all('li')]
            for time_ in time_list:
                if time_ == '24 小時營業':  # open 24 hours
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": '0000'
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                elif time_ == '休息':  # closed that day
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": ''
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                else:
                    start, end = time_.split('–')
                    end_hour, end_min = end.split(':')
                    start_hour, start_min = start.split(':')
                    # A closing hour before the opening hour means the
                    # period spills into the next day.
                    if end_hour < start_hour:
                        end_day = week_list[week] + 1
                    else:
                        end_day = week_list[week]
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": start.replace(':', '')
                        },
                        "close": {
                            "day": end_day,
                            "time": end.replace(':', '')
                        }
                    }]
            weekday_text += ["{}: {}".format(week, ', '.join(time_list))]
    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    return output

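# For example, a '星期一' row showing '09:00–18:00' produces (assuming
# week_list, from utility.parseutils, maps '星期一' to 1):
#
#   periods == [{"open": {"day": 1, "time": "0900"},
#                "close": {"day": 1, "time": "1800"}}]
#   weekday_text == ['星期一: 09:00–18:00']
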
def get_reviews(driver, output):
    # Open the full review pane, expand every photo and truncated comment,
    # then scrape the rendered reviews out of the page source.
    wait = WebDriverWait(driver, 30)
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
    )
    element = driver.find_element_by_css_selector(more_reviews_css)
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(2)

    all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
    for ap in all_photo:
        ap.click()

    # '顯示更多' is the zh-TW "Show more" button on truncated comments.
    all_review = driver.find_elements_by_css_selector('button[aria-label="顯示更多"]')
    for ap in all_review:
        ap.click()

    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    count = 0
    reviews = []
    for comment in comment_soup.find_all('div', {'class': 'ODSEW-ShBeI'}):
        comment_a_tag = comment.find_all('a')
        author_name = blank_check(comment_a_tag[1].find('div', class_='ODSEW-ShBeI-title').text)
        profile_photo_url = comment_a_tag[0].find('img')['src']
        # '顆星' is the zh-TW "stars" suffix on the rating label.
        rating = blank_check(comment.find('span', {'role': 'img'})['aria-label'].replace('顆星', ''))
        text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
        created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text

        photos = []
        for i in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c'):
            # The photo URL sits in the button's inline style:
            # background-image: url("https://...")
            path = i['style'].split(';')[0].split('url')[1].replace('\"', '').replace('(', '').replace(')', '')
            photos += [path]

        reviews += [{
            'id': comment.find('a')['href'].split('/')[5],
            'author_name': author_name,
            'profile_photo_url': profile_photo_url,
            'rating': int(rating),
            'text': text,
            'created_at': created_at,
            'photos': photos
        }]
        count += 1
    output['reviews'] = str(reviews)
    driver.back()
    return output

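# Each scraped review dict has this shape (values illustrative only):
#
#   {'id': '<segment 5 of the reviewer profile URL>',
#    'author_name': '...', 'profile_photo_url': 'https://...',
#    'rating': 5, 'text': '...', 'created_at': '...',
#    'photos': ['https://...', ...]}
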
# def get_photo(output, shop_soup):
#     shop_photo = {}
#     for i in shop_soup.find('div', {'aria-label': '{}的相片'.format(output['name'])}).find_all('button'):
#         try:
#             if i['aria-label'] == '街景服務和 360 度相片' or i['aria-label'] == '影片':
#                 continue
#             shop_photo[i['aria-label']] = i.find('img')['src']
#         except:
#             pass
#     output['shop_photo'] = shop_photo
#     return output

def find_photo_list(driver):
    # Walk the first few thumbnails so they render, collect their
    # data-photo-index values, then pull the image URLs from inline styles.
    wait = WebDriverWait(driver, 60)
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
    )
    count_list = []
    for i in range(1, 6):
        try:
            element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[{}]/div/a'.format(i))
            count_list += [element.get_attribute('data-photo-index')]
            actions = ActionChains(driver)
            actions.move_to_element(element).perform()
        except Exception:
            # Fewer than five thumbnails on this tab.
            break

    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
    photo_url = []
    for photo_id in count_list:
        for i in photo_soup.select('a[data-photo-index="{}"]'.format(photo_id))[0].find_all('div'):
            if i['style'].find('width') != -1:
                sentence = i['style']
                photo = re.search(r'https:(.*)\"', sentence)
                print(sentence)
                photo_url += [photo.group(0).replace('\"', '')]
                break
    return photo_url

def find_big_photo(output, driver):
    # Open the photo viewer ('…的相片' = "photos of …"), then scrape the
    # '全部' (all) and '菜單' (menu) tabs.
    element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
    ActionChains(driver).move_to_element(element).click(element).perform()
    photo_map = {
        '全部': 'shop_photo',
        '菜單': 'menu_photo'
    }
    tab_dict = {}
    for tab_index in [0, 1, 2]:
        photo_name = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)).text
        if photo_name in photo_map:
            tab_dict[photo_name] = tab_index
    print(tab_dict)

    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        photo_list = find_photo_list(driver)
        output[photo_map[tab_]] = str(photo_list)
    return output

def get_url_list(driver):
    # wait = WebDriverWait(driver, 10)
    # wait.until(
    #     EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
    # )
    # driver.back()
    time.sleep(2)
    # Send DOWN to successive result rows so the lazy-loaded list renders.
    for i in range(5, 43, 2):
        driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except KeyError:
            # Anchor without an href or aria-label attribute.
            pass
    return url_list

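# Shape of the return value: a list of [place_url, place_name] pairs, one
# per '/maps/place' anchor found in the results pane, e.g.
#
#   [['https://www.google.com.tw/maps/place/...', '<shop name>'], ...]
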
def data_select_insert(db, table_name, table_col, data):
    # Assemble one row in table_col order, stamping crawler_date last.
    tmp = []
    for name_ in table_col:
        if name_ == 'crawler_date':
            continue
        tmp += [data[name_]]
    tmp += [datetime.today().strftime("%Y/%m/%d %H:%M")]
    # Values are interpolated via Python's tuple repr, so this relies on the
    # scraped strings being well-behaved; a parameterised query would be safer.
    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}""" \
        .format(table_name, str(tuple(table_col)).replace('\'', ''), tuple(tmp))
    DA.mysql_insert_data(db, insert_sql)

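# A sketch of the statement this builds, assuming (hypothetically) that
# table_name is 'shop_list' and table_col is ('name', 'addr', 'crawler_date'):
#
#   INSERT IGNORE INTO shop_list(name, addr, crawler_date)
#   VALUES ('<name>', '<addr>', '2021/12/07 12:00')
#
# tuple(tmp)'s repr supplies the quoting, which is why stray single quotes
# inside scraped strings can break the query.
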
def main():
    # First coordinate (latitude / longitude) from the input CSV.
    data = pd.read_csv('lat_long_location.csv', index_col=0)
    tmp = data.iloc[0]
    latitude = tmp['latitude']
    longitude = tmp['longitude']
    url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)

    # driver = service_create('Profile 1')
    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    print('driver start...')
    driver = browser_start()
    driver.get(url)
    keyin_keyword(driver, '咖啡')  # search for "coffee"
    url_list = get_url_list(driver)
    result = []
    # try:
    for item_url, name in url_list:
        print(name, ': ', item_url)
        driver.get(item_url)

        # '顯示本週營業時間' labels the "show this week's hours" toggle.
        wait = WebDriverWait(driver, 120)
        time_css = "span[aria-label='顯示本週營業時間']"
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, time_css))
        )
        element = driver.find_element_by_css_selector(time_css)
        driver.implicitly_wait(30)
        ActionChains(driver).move_to_element(element).click(element).perform()
        time.sleep(1)

        shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
        output = {
            'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
        }
        print(output['name'])

        output = get_shop_info(driver, output, shop_soup)
        output = get_intro_info(driver, output)
        output = get_time_list(shop_soup, output)
        output = get_reviews(driver, output)
        output = find_big_photo(output, driver)

        # Build a Google search query and URL for the shop as extra metadata.
        output_name = output['name'].replace('(', '').replace(')', '')
        query_name = '{}+{}'.format(output_name, output['addr'])
        query_name = query_name.replace(' ', '')
        output['query_name'] = query_name
        output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
        time.sleep(2)

        result += [output]
        with open('result/20211207_{}.json'.format(name), 'w') as f:
            json.dump(output, f)
        data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
        break  # stop after the first shop
    # except:
    #     shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
    #     print("error {}".format(id_))
    #     print(blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text))

if __name__ == '__main__':
    main()