# -*- coding: utf-8 -*-
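# run.py
#
# Crawls Google Maps (zh-TW interface) for shop listings around a seed
# coordinate: searches a keyword, walks the result list, and scrapes each
# place's details, opening hours, reviews, and photos into MySQL via the
# utility package. Selectors target the late-2021 Maps DOM (obfuscated
# class names such as 'ODSEW-ShBeI'), so they are fragile and likely stale.
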
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from utility import database_access as DA
from utility.parseutils import *
from utility.connect import *
from datetime import datetime
import pandas as pd
import time
import json
import re
# import pyautogui as pag
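
# Launches a local Chrome reusing an existing user profile (cookies,
# language settings). The user-data-dir path and chromedriver binary are
# machine-specific; session_id / executor_url are printed so the session
# can be reattached to later if needed.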
def service_create(profilepath):
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    option.add_argument("--user-data-dir=C:\\Users\\noodles\\AppData\\Local\\Google\\Chrome\\User Data")
    option.add_argument("profile-directory=" + profilepath)
    driver = webdriver.Chrome('./utility/chromedriver_20211103/chromedriver', options=option)
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver
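
# Connects to a remote Selenium Grid hub instead of a local browser; the
# hub address is hard-coded, so point it at your own grid before running.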
def browser_start():
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://192.53.174.202:4444/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser
def keyin_keyword(driver, keyword):
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)
    # element = driver.find_element_by_class_name("V0h1Ob-haAclf")
    # driver.implicitly_wait(30)
    # ActionChains(driver).move_to_element(element).click(element).perform()
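
# Heuristic: the 9th pane row holds either a "預訂" (reserve) button or the
# opening-hours block. If it is not the reserve button, click it to expand
# the hours and return 1; otherwise return 0. (Currently unused by main().)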
def open_time(driver):
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
    if element.text.find('預訂') == -1:
        element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[9]/div[2]')
        driver.implicitly_wait(20)
        ActionChains(driver).move_to_element(element).click(element).perform()
        return 1
    else:
        return 0
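
# Scrapes basic place fields from the detail pane: lat/lon are parsed out
# of the current URL's "@lat,lon,zoom" segment, city/area/address/phone
# come from aria-labels, and the remaining fields are driven by the
# element_list mapping imported from utility.parseutils.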
def get_shop_info(driver, output, shop_soup):
    current_url_split = driver.current_url.split('@')[1].split(',')
    output['lon'] = current_url_split[1]
    output['lat'] = current_url_split[0]

    location = shop_soup.find('button', {'data-item-id': 'oloc'})['aria-label'].split(' ')
    output['city'] = location[-1]
    output['area'] = location[-2]
    output['addr'] = shop_soup.find('button', {'data-item-id': 'address'})['aria-label'].replace('地址:', '')
    output['tel'] = blank_check(shop_soup.find('button', {'data-tooltip': '複製電話號碼'})['aria-label'].split(':')[1])
    print(output['addr'], ', ', output['tel'])

    for key in element_list:
        element = element_list[key]
        if len(element) == 3:
            value = shop_soup.find(element[0], element[1])[element[2]]
        else:
            tmp_value = shop_soup.find(element[0], element[1])
            if tmp_value:
                value = tmp_value.text
            else:
                value = ''
        output[key] = value_check(key, value)
    return output
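
# Opens the "About" panel, scrolls through its rows so lazy-loaded sections
# render, then collects each amenity row showing the check icon into a
# JSON-ish list per category defined by intro_list from utility.parseutils.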
def get_intro_info(driver, output):
    element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[6]')
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()

    for i in range(5, 35, 3):
        try:
            element = driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[{}]'.format(i))
            actions = ActionChains(driver)
            actions.move_to_element(element).perform()
        except Exception:
            # no row at this index: the panel is fully scrolled
            break

    intro_soup = BeautifulSoup(driver.page_source, 'html.parser')
    for key in intro_list:
        elements = intro_soup.find('div', {'aria-label': key})
        if elements:
            element = elements.find_all('li', {'class': 'LQjNnc-p83tee-JNdkSc-ibnC6b'})
            # print(element)
            count = 0
            tmp = []
            for ele in element:
                if ele.find('img', {'src': "//www.gstatic.com/images/icons/material/system_gm/2x/check_black_18dp.png"}):
                    tmp += [{
                        'id': count,
                        intro_list[key][1]: blank_check(ele.text)
                    }]
                    count += 1
            print(str(tmp))
            output[intro_list[key][0]] = str(tmp)
        else:
            output[intro_list[key][0]] = '[]'
    driver.back()
    return output
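
# Parses the expanded opening-hours table into Google-Places-style period
# dicts. open_now is derived from the status badge ('永久停業' = permanently
# closed, '暫時關閉' = temporarily closed); '24 小時營業' means open 24 hours
# and '休息' means closed that day. week_list (from utility.parseutils) maps
# the weekday label to a day index.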
def get_time_list(shop_soup, output):
    periods = []
    weekday_text = []
    open_now = blank_check(shop_soup.find('span', {'class': 'LJKBpe-Tswv1b-hour-text'}).text.split('\xa0')[0])
    if open_now == '永久停業' or open_now == '暫時關閉':
        output['open_now'] = 'False'
    else:
        output['open_now'] = 'True'

    for tr_ in shop_soup.find_all('tr'):
        if tr_.find('div').text.replace(' ', '') != '':
            week = tr_.find('div').text
            time_list = [blank_check(i.text) for i in tr_.find_all('li')]
            for time_ in time_list:
                if time_ == '24 小時營業':
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            # stored as a string; the original bare 0000 literal
                            # would collapse to the int 0
                            "time": '0000'
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                elif time_ == '休息':
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": ''
                        },
                        "close": {
                            "day": week_list[week],
                            "time": ''
                        }
                    }]
                else:
                    start, end = time_.split('–')
                    end_hour, end_min = end.split(':')
                    start_hour, start_min = start.split(':')
                    # zero-padded "HH" strings compare correctly lexicographically,
                    # so this detects closing times past midnight
                    if end_hour < start_hour:
                        end_day = week_list[week] + 1
                    else:
                        end_day = week_list[week]
                    periods += [{
                        "open": {
                            "day": week_list[week],
                            "time": start.replace(':', '')
                        },
                        "close": {
                            "day": end_day,
                            "time": end.replace(':', '')
                        }
                    }]
            weekday_text += ["{}: {}".format(week, ', '.join(time_list))]

    output['periods'] = str(periods)
    output['weekday_text'] = str(weekday_text)
    return output
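
# Clicks through to the reviews pane, expands every photo and each "顯示更多"
# (show more) link, then scrapes author, rating, text, date, and photo URLs
# for every review currently rendered. The review id is taken from the
# contributor-profile href.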
def get_reviews(driver, output):
    wait = WebDriverWait(driver, 30)
    more_reviews_css = "button[jsaction='pane.rating.moreReviews']"
    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, more_reviews_css))
    )
    element = driver.find_element_by_css_selector(more_reviews_css)
    driver.implicitly_wait(20)
    ActionChains(driver).move_to_element(element).click(element).perform()
    time.sleep(2)

    all_photo = driver.find_elements_by_class_name('ODSEW-ShBeI-xJzy8c-bF1uUb')
    for ap in all_photo:
        ap.click()

    # note: the original selector was missing its closing "]"
    all_review = driver.find_elements_by_css_selector('button[aria-label="顯示更多"]')
    for ap in all_review:
        ap.click()

    comment_soup = BeautifulSoup(driver.page_source, 'html.parser')
    count = 0
    reviews = []
    for comment in comment_soup.find_all('div', {'class': 'ODSEW-ShBeI'}):
        comment_a_tag = comment.find_all('a')
        author_name = blank_check(comment_a_tag[1].find('div', class_='ODSEW-ShBeI-title').text)
        profile_photo_url = comment_a_tag[0].find('img')['src']
        rating = blank_check(comment.find('span', {'role': 'img'})['aria-label'].replace('顆星', ''))
        text = comment.find('div', class_='ODSEW-ShBeI-ShBeI-content').text
        created_at = comment.find('span', class_='ODSEW-ShBeI-RgZmSc-date').text

        photos = []
        c = 0
        for i in comment.find_all('button', class_='ODSEW-ShBeI-xJzy8c'):
            path = i['style'].split(';')[0].split('url')[1].replace('\"', '').replace('(', '').replace(')', '')
            photos += [path]
            c += 1

        reviews += [{
            'id': comment.find('a')['href'].split('/')[5],
            'author_name': author_name,
            'profile_photo_url': profile_photo_url,
            'rating': int(rating),
            'text': text,
            'created_at': created_at,
            'photos': photos
        }]
        count += 1

    output['reviews'] = str(reviews)
    driver.back()
    return output

# def get_photo(output, shop_soup):
#     shop_photo = {}
#     for i in shop_soup.find('div', {'aria-label': '{}的相片'.format(output['name'])}).find_all('button'):
#         try:
#             if i['aria-label'] == '街景服務和 360 度相片' or i['aria-label'] == '影片':
#                 continue
#             shop_photo[i['aria-label']] = i.find('img')['src']
#         except:
#             pass
#     output['shop_photo'] = shop_photo
#     return output
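
# On an open photo-gallery tab, scrolls through up to five thumbnails and
# pulls each image URL out of the inline background-image style of the
# matching a[data-photo-index] element.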
def find_photo_list(driver):
    time.sleep(2)
    wait = WebDriverWait(driver, 60)
    wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[1]/div/a'))
    )
    count_list = []
    for i in range(1, 6):
        try:
            element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[3]/div[1]/div[{}]/div/a'.format(i))
            count_list += [element.get_attribute('data-photo-index')]
            actions = ActionChains(driver)
            actions.move_to_element(element).perform()
        except Exception:
            # no thumbnail at this index: stop scrolling
            break

    photo_soup = BeautifulSoup(driver.page_source, 'html.parser')
    photo_url = []
    for photo_id in count_list:
        for i in photo_soup.select('a[data-photo-index="{}"]'.format(photo_id))[0].find_all('div'):
            if i['style'].find('width') != -1:
                sentence = i['style']
                photo = re.search(r'https:(.*)\"', sentence)
                print(sentence)
                photo_url += [photo.group(0).replace('\"', '')]
                break
    return photo_url
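
# Opens the place's photo gallery, locates the '全部' (all) and '菜單' (menu)
# tabs among the first three tab buttons, and stores the photo URL list
# scraped from each tab under shop_photo / menu_photo.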
def find_big_photo(output, driver):
    element = driver.find_element(By.CSS_SELECTOR, "div[aria-label='{}的相片']".format(output['name']))
    ActionChains(driver).move_to_element(element).click(element).perform()

    photo_map = {
        '全部': 'shop_photo',
        '菜單': 'menu_photo'
    }

    tab_dict = {}
    for tab_index in [0, 1, 2]:
        photo_name = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)).text
        if photo_name == '菜單':
            tab_dict[photo_name] = tab_index
        elif photo_name == '全部':
            tab_dict[photo_name] = tab_index
    print(tab_dict)

    for tab_ in tab_dict:
        tab_index = tab_dict[tab_]
        print(tab_index)
        wait = WebDriverWait(driver, 60)
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index)))
        )
        element = driver.find_element(By.CSS_SELECTOR, "button[data-tab-index='{}']".format(tab_index))
        ActionChains(driver).move_to_element(element).click(element).perform()
        photo_list = find_photo_list(driver)
        output[photo_map[tab_]] = str(photo_list)
    return output
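
# Scrolls the search-result list by sending DOWN to successive result rows
# (odd pane indices 5..41), then harvests every '/maps/place' link together
# with its aria-label (the place name).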
def get_url_list(driver):
    # wait = WebDriverWait(driver, 10)
    # wait.until(
    #     EC.element_to_be_clickable((By.XPATH, '//*[@id="sGi9mc-m5SR9c-bottom-pane"]/div/div[1]/div/div/div/div[1]/div[2]/div[2]'))
    # )
    # driver.back()
    time.sleep(2)
    for i in range(5, 43, 2):
        driver.find_element(By.XPATH, '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[{}]/div/a'.format(i)).send_keys(Keys.DOWN)

    url_soup = BeautifulSoup(driver.page_source, 'html.parser')
    url_list = []
    for i in url_soup.find_all('a'):
        try:
            if i['href'].find('maps/place') != -1:
                url_list += [[i['href'], i['aria-label']]]
        except KeyError:
            # anchor lacks an href or aria-label attribute
            pass
    return url_list
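
# Builds an INSERT IGNORE statement by string formatting: values follow
# table_col order, with crawler_date replaced by the current timestamp.
# tuple(tmp)'s repr supplies the value quoting, which works for these flat
# string fields but is not SQL-injection safe in general.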
def data_select_insert(db, table_name, table_col, data):
    tmp = []
    for name_ in table_col:
        if name_ == 'crawler_date':
            continue
        tmp += [data[name_]]
    tmp += [datetime.today().strftime("%Y/%m/%d %H:%M")]

    insert_sql = """INSERT IGNORE INTO {}{} VALUES {}"""\
        .format(table_name, str(tuple(table_col)).replace('\'', ''), tuple(tmp))
    DA.mysql_insert_data(db, insert_sql)
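
# Tries to expand this week's opening hours; returns '正常' (open as usual)
# on success, or '暫時關閉' (temporarily closed) when only the collapsed
# aria-expanded='false' block is present.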
def time_click(driver):
    status = ''
    try:
        time_css = "span[aria-label='顯示本週營業時間']"
        element = driver.find_element_by_css_selector(time_css)
        driver.implicitly_wait(30)
        ActionChains(driver).move_to_element(element).click(element).perform()
        status = '正常'
    except NoSuchElementException:
        time_css = "div[aria-expanded='false']"
        elem = driver.find_element_by_css_selector(time_css)
        if elem:
            status = '暫時關閉'
    return status
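
# Entry point: reads the first seed coordinate from lat_long_location.csv,
# opens Maps there, searches for 咖啡 (coffee), then scrapes each result in
# turn. The trailing break keeps the run to a single shop, presumably for
# testing; remove it to process the whole result list.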
def main():
    data = pd.read_csv('lat_long_location.csv', index_col=0)
    tmp = data.iloc[0]
    latitude = tmp['latitude']    # latitude
    longitude = tmp['longitude']  # longitude
    url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude, longitude)
    # driver = service_create('Profile 1')

    db = DA.mysql_connect(MYSQL_CONFIG, DB_NAME)
    print('driver start...')
    driver = browser_start()
    driver.get(url)
    keyin_keyword(driver, '咖啡')
    url_list = get_url_list(driver)

    result = []
    # try:
    for item_url, name in url_list:
        print(name, ': ', item_url)
        driver.get(item_url)
        for i in range(4, 26, 2):
            element = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[{}]'.format(i))
            actions = ActionChains(driver)
            actions.move_to_element(element).perform()

        time_status = time_click(driver)
        time.sleep(1)
        shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
        output = {
            'name': blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text)
        }
        print(output['name'])

        output = get_shop_info(driver, output, shop_soup)
        output = get_intro_info(driver, output)
        output = get_time_list(shop_soup, output)
        output = get_reviews(driver, output)
        output = find_big_photo(output, driver)

        output_name = output['name'].replace('(', '').replace(')', '')
        query_name = '{}+{}'.format(output_name, output['addr'])
        query_name = query_name.replace(' ', '')
        output['query_name'] = query_name
        output['google_url'] = 'https://www.google.com.tw/search?q={}'.format(query_name)
        time.sleep(2)

        result += [output]
        with open('result/20211207_{}.json'.format(name), 'w') as f:
            json.dump(output, f)
        data_select_insert(db, SHOP_LIST_TABLE, SHOP_LIST_TABLE_COL, output)
        break
    # except:
    #     shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
    #     print("error {}".format(id_))
    #     print(blank_check(shop_soup.find('h1', class_='x3AX1-LfntMc-header-title-title').text))

if __name__ == '__main__':
    main()