utility.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. from bs4 import BeautifulSoup
  2. import pandas as pd
  3. from selenium.webdriver.common.by import By
  4. from selenium import webdriver
  5. from selenium.webdriver.common.action_chains import ActionChains
  6. from selenium.webdriver.common.keys import Keys
  7. #from seleniumwire import webdriver
  8. #from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  9. import time
  10. def brower_start(port):
  11. options = webdriver.ChromeOptions()
  12. options.add_argument("--no-sandbox")
  13. options.add_argument("--disable-dev-shm-usage")
  14. # browser = webdriver.Chrome(options=options)
  15. # 上面成功再來用docker
  16. browser = webdriver.Remote(
  17. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  18. desired_capabilities=options.to_capabilities(),
  19. options=options
  20. )
  21. return browser
  22. def brower_start2(port):
  23. option = webdriver.ChromeOptions()
  24. option.add_argument('--disable-web-security')
  25. option.add_argument('--allow-running-insecure-content')
  26. driver = webdriver.Chrome(options=option)
  27. executor_url = driver.command_executor._url
  28. session_id = driver.session_id
  29. print (session_id)
  30. print (executor_url)
  31. time.sleep(3)
  32. return driver
  33. def serive_create():
  34. option = webdriver.ChromeOptions()
  35. option.add_argument('--disable-web-security')
  36. option.add_argument('--allow-running-insecure-content')
  37. # option.add_argument("--user-data-dir=//Users//noodles//Documents//project")
  38. # option.add_argument("profile-directory="+profilepath)
  39. driver = webdriver.Chrome('../../driver/chromedriver_win32/chromedriver', options=option)
  40. executor_url = driver.command_executor._url
  41. session_id = driver.session_id
  42. print (session_id)
  43. print (executor_url)
  44. time.sleep(3)
  45. return driver
  46. def string_check(x):
  47. return x.rstrip().lstrip()
  48. def get_content_info(driver):
  49. shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
  50. post_info = shop_soup.select("a.app-aware-link div.update-components-actor__meta ")[0]
  51. post_name = post_info.find('span', class_='t-bold').text
  52. post_name = string_check(post_name)
  53. post_position = post_info.find('span', class_='t-black--light').text
  54. post_position = string_check(post_position)
  55. print(post_name, ';', post_position)
  56. content = shop_soup.find('div',class_='feed-shared-update-v2__description-wrapper').select("span[dir='ltr']")[0].text
  57. print(content)
  58. try:
  59. content_url = shop_soup.select('div.update-components-article__link-container')[0].find('a').get('href')
  60. except:
  61. content_url = ''
  62. return {
  63. 'post_name': post_name,
  64. 'post_position':post_position,
  65. 'content':content,
  66. 'content_url':content_url
  67. }
  68. def linkedin_login(driver, config, user_choose='person2'):
  69. user = config[user_choose]['user']
  70. passwd = config[user_choose]['passwd']
  71. user_button = driver.find_element(By.ID, "username")
  72. driver.implicitly_wait(30)
  73. ActionChains(driver).move_to_element(user_button).click(user_button).send_keys(user).perform()
  74. # time.sleep(3)
  75. passwd_button = driver.find_element(By.ID, "password")
  76. driver.implicitly_wait(30)
  77. ActionChains(driver).move_to_element(passwd_button).click(passwd_button).send_keys(passwd).send_keys(Keys.ENTER).perform()
  78. # time.sleep(1)
  79. def check_duplicate(table_name, column, db):
  80. result = db.query(f'SELECT {column} FROM {table_name}')
  81. result = pd.DataFrame([dict(i) for i in result])
  82. return result[column].to_list()
  83. def check_page(driver):
  84. soup = BeautifulSoup(driver.page_source, 'html.parser')
  85. try:
  86. if soup.find('h2', class_='headline-new').text.find('我們無法聯絡到您') != -1:
  87. print('email error')
  88. ignore_button = driver.find_element(By.CSS_SELECTOR, "button.secondary-action-new")
  89. driver.implicitly_wait(30)
  90. ActionChains(driver).move_to_element(ignore_button).click(ignore_button).perform()
  91. except:
  92. pass