utility.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. from bs4 import BeautifulSoup
  2. import pandas as pd
  3. from selenium.webdriver.common.by import By
  4. from selenium import webdriver
  5. from selenium.webdriver.common.action_chains import ActionChains
  6. from selenium.webdriver.common.keys import Keys
  7. import time
  8. def brower_start(port):
  9. options = webdriver.ChromeOptions()
  10. browser = webdriver.Remote(
  11. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  12. desired_capabilities=options.to_capabilities()
  13. )
  14. return browser
  15. def serive_create():
  16. option = webdriver.ChromeOptions()
  17. option.add_argument('--disable-web-security')
  18. option.add_argument('--allow-running-insecure-content')
  19. # option.add_argument("--user-data-dir=//Users//noodles//Documents//project")
  20. # option.add_argument("profile-directory="+profilepath)
  21. driver = webdriver.Chrome('../../driver/chromedriver_20230202/chromedriver', options=option)
  22. executor_url = driver.command_executor._url
  23. session_id = driver.session_id
  24. print (session_id)
  25. print (executor_url)
  26. time.sleep(3)
  27. return driver
  28. def string_check(x):
  29. return x.rstrip().lstrip()
  30. def get_content_info(driver):
  31. shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
  32. post_info = shop_soup.select("a.app-aware-link div.update-components-actor__meta ")[0]
  33. post_name = post_info.find('span', class_='t-bold').text
  34. post_name = string_check(post_name)
  35. post_position = post_info.find('span', class_='t-black--light').text
  36. post_position = string_check(post_position)
  37. print(post_name, ';', post_position)
  38. content = shop_soup.find('div',class_='feed-shared-update-v2__description-wrapper').select("span[dir='ltr']")[0].text
  39. print(content)
  40. try:
  41. content_url = shop_soup.select('div.update-components-article__link-container')[0].find('a').get('href')
  42. except:
  43. content_url = ''
  44. return {
  45. 'post_name': post_name,
  46. 'post_position':post_position,
  47. 'content':content,
  48. 'content_url':content_url
  49. }
  50. def linkedin_login(driver, config, user_choose='person2'):
  51. user = config[user_choose]['user']
  52. passwd = config[user_choose]['passwd']
  53. user_button = driver.find_element(By.ID, "username")
  54. driver.implicitly_wait(30)
  55. ActionChains(driver).move_to_element(user_button).click(user_button).send_keys(user).perform()
  56. # time.sleep(3)
  57. passwd_button = driver.find_element(By.ID, "password")
  58. driver.implicitly_wait(30)
  59. ActionChains(driver).move_to_element(passwd_button).click(passwd_button).send_keys(passwd).send_keys(Keys.ENTER).perform()
  60. # time.sleep(1)
  61. def check_duplicate(table_name, column, db):
  62. result = db.query(f'SELECT {column} FROM {table_name}')
  63. result = pd.DataFrame([dict(i) for i in result])
  64. return result[column].to_list()
  65. def check_page(driver):
  66. soup = BeautifulSoup(driver.page_source, 'html.parser')
  67. try:
  68. if soup.find('h2', class_='headline-new').text.find('我們無法聯絡到您') != -1:
  69. print('email error')
  70. ignore_button = driver.find_element(By.CSS_SELECTOR, "button.secondary-action-new")
  71. driver.implicitly_wait(30)
  72. ActionChains(driver).move_to_element(ignore_button).click(ignore_button).perform()
  73. except:
  74. pass