utility.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. from bs4 import BeautifulSoup
  2. import pandas as pd
  3. from selenium.webdriver.common.by import By
  4. from selenium import webdriver
  5. from selenium.webdriver.common.action_chains import ActionChains
  6. from selenium.webdriver.common.keys import Keys
  7. import time
  8. def brower_start(port, proxyport, chrome_window):
  9. print(proxyport)
  10. options = webdriver.ChromeOptions()
  11. if chrome_window:
  12. browser = webdriver.Chrome(
  13. desired_capabilities=options.to_capabilities()
  14. )
  15. else:
  16. chrome_options = webdriver.ChromeOptions()
  17. chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport)) # Specify your Kubernetes service-name here
  18. chrome_options.add_argument('--ignore-certificate-errors')
  19. chrome_options.add_argument("--no-sandbox")
  20. chrome_options.add_argument("--disable-dev-shm-usage")
  21. browser = webdriver.Remote(
  22. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  23. desired_capabilities=chrome_options.to_capabilities(),
  24. seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
  25. )
  26. # seleniumwire_options = {'addr': '172.17.0.2','port':4444})
  27. browser.set_window_size(1400,1000)
  28. return browse
  29. def brower_start2(port):
  30. option = webdriver.ChromeOptions()
  31. option.add_argument('--disable-web-security')
  32. option.add_argument('--allow-running-insecure-content')
  33. driver = webdriver.Chrome(options=option)
  34. executor_url = driver.command_executor._url
  35. session_id = driver.session_id
  36. print (session_id)
  37. print (executor_url)
  38. time.sleep(3)
  39. return driver
  40. def serive_create():
  41. option = webdriver.ChromeOptions()
  42. option.add_argument('--disable-web-security')
  43. option.add_argument('--allow-running-insecure-content')
  44. # option.add_argument("--user-data-dir=//Users//noodles//Documents//project")
  45. # option.add_argument("profile-directory="+profilepath)
  46. driver = webdriver.Chrome('../../driver/chromedriver_win32/chromedriver', options=option)
  47. executor_url = driver.command_executor._url
  48. session_id = driver.session_id
  49. print (session_id)
  50. print (executor_url)
  51. time.sleep(3)
  52. return driver
  53. def string_check(x):
  54. return x.rstrip().lstrip()
  55. def get_content_info(driver):
  56. shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
  57. post_info = shop_soup.select("a.app-aware-link div.update-components-actor__meta ")[0]
  58. post_name = post_info.find('span', class_='t-bold').text
  59. post_name = string_check(post_name)
  60. post_position = post_info.find('span', class_='t-black--light').text
  61. post_position = string_check(post_position)
  62. print(post_name, ';', post_position)
  63. content = shop_soup.find('div',class_='feed-shared-update-v2__description-wrapper').select("span[dir='ltr']")[0].text
  64. print(content)
  65. try:
  66. content_url = shop_soup.select('div.update-components-article__link-container')[0].find('a').get('href')
  67. except:
  68. content_url = ''
  69. return {
  70. 'post_name': post_name,
  71. 'post_position':post_position,
  72. 'content':content,
  73. 'content_url':content_url
  74. }
  75. def linkedin_login(driver, config, user_choose='person2'):
  76. user = config[user_choose]['user']
  77. passwd = config[user_choose]['passwd']
  78. user_button = driver.find_element(By.ID, "username")
  79. driver.implicitly_wait(30)
  80. ActionChains(driver).move_to_element(user_button).click(user_button).send_keys(user).perform()
  81. # time.sleep(3)
  82. passwd_button = driver.find_element(By.ID, "password")
  83. driver.implicitly_wait(30)
  84. ActionChains(driver).move_to_element(passwd_button).click(passwd_button).send_keys(passwd).send_keys(Keys.ENTER).perform()
  85. # time.sleep(1)
  86. def check_duplicate(table_name, column, db):
  87. result = db.query(f'SELECT {column} FROM {table_name}')
  88. result = pd.DataFrame([dict(i) for i in result])
  89. return result[column].to_list()
  90. def check_page(driver):
  91. soup = BeautifulSoup(driver.page_source, 'html.parser')
  92. try:
  93. if soup.find('h2', class_='headline-new').text.find('我們無法聯絡到您') != -1:
  94. print('email error')
  95. ignore_button = driver.find_element(By.CSS_SELECTOR, "button.secondary-action-new")
  96. driver.implicitly_wait(30)
  97. ActionChains(driver).move_to_element(ignore_button).click(ignore_button).perform()
  98. except:
  99. pass