"""Selenium / BeautifulSoup helpers: browser start-up, LinkedIn login and post scraping."""

import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
#from seleniumwire import webdriver
#from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


def brower_start(port):
    """Connect to a remote WebDriver hub on localhost and return the driver.

    Args:
        port: Port of the hub listening at ``http://127.0.0.1:<port>/wd/hub``.

    Returns:
        The ``webdriver.Remote`` instance configured with Chrome options.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Original note: once a plain local `webdriver.Chrome(options=options)`
    # run succeeds, switch to the Docker-hosted remote driver below.
    # Only `options` is passed: the options object already carries the
    # capabilities, and newer Selenium releases no longer accept
    # `desired_capabilities`.
    browser = webdriver.Remote(
        command_executor=f'http://127.0.0.1:{port}/wd/hub',
        options=options,
    )
    return browser


def brower_start2(port):
    """Start a local Chrome driver with relaxed web-security flags.

    Prints the session id and the command-executor URL, sleeps three seconds
    and returns the driver.

    Args:
        port: Unused by this function; kept so existing callers keep working.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    driver = webdriver.Chrome(options=option)

    # `_url` is a private attribute of the command executor.
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver


def serive_create():
    """Start a local Chrome driver from the bundled chromedriver binary.

    Prints the session id and the command-executor URL, sleeps three seconds
    and returns the driver.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    # option.add_argument("--user-data-dir=//Users//noodles//Documents//project")
    # option.add_argument("profile-directory="+profilepath)

    # NOTE(review): the positional driver path is the older Selenium calling
    # convention; confirm against the installed Selenium version.
    driver = webdriver.Chrome('../../driver/chromedriver_win32/chromedriver', options=option)

    # `_url` is a private attribute of the command executor.
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver


def string_check(x):
    """Return *x* with leading and trailing whitespace removed."""
    return x.strip()


def get_content_info(driver):
    """Parse the current page and extract the post author, text and link.

    Args:
        driver: Object exposing ``page_source`` (the HTML to parse).

    Returns:
        A dict with keys ``post_name``, ``post_position``, ``content`` and
        ``content_url``. ``content_url`` is ``''`` when the page has no
        article link container or that container has no ``<a>`` element.

    Raises:
        IndexError: If the author block or the text span is not found.
        AttributeError: If an expected author span or the description
            wrapper is missing.
    """
    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')

    post_info = shop_soup.select("a.app-aware-link div.update-components-actor__meta ")[0]
    post_name = string_check(post_info.find('span', class_='t-bold').text)
    post_position = string_check(post_info.find('span', class_='t-black--light').text)
    print(post_name, ';', post_position)

    description = shop_soup.find('div', class_='feed-shared-update-v2__description-wrapper')
    content = description.select("span[dir='ltr']")[0].text
    print(content)

    # The article link is optional: a missing container raises IndexError,
    # a container without an <a> raises AttributeError.
    try:
        link_box = shop_soup.select('div.update-components-article__link-container')[0]
        content_url = link_box.find('a').get('href')
    except (IndexError, AttributeError):
        content_url = ''

    return {
        'post_name': post_name,
        'post_position': post_position,
        'content': content,
        'content_url': content_url,
    }


def linkedin_login(driver, config, user_choose='person2'):
    """Type the credentials into the login form and submit it with ENTER.

    Args:
        driver: Selenium driver already showing a page with ``username`` and
            ``password`` inputs.
        config: Mapping where ``config[user_choose]`` holds ``'user'`` and
            ``'passwd'`` entries.
        user_choose: Key in *config* selecting which account to use.
    """
    user = config[user_choose]['user']
    passwd = config[user_choose]['passwd']

    # Set the implicit wait before the lookups so that it applies to them;
    # it stays in effect for the rest of the session.
    driver.implicitly_wait(30)

    user_button = driver.find_element(By.ID, "username")
    ActionChains(driver).move_to_element(user_button).click(user_button).send_keys(user).perform()

    passwd_button = driver.find_element(By.ID, "password")
    (
        ActionChains(driver)
        .move_to_element(passwd_button)
        .click(passwd_button)
        .send_keys(passwd)
        .send_keys(Keys.ENTER)
        .perform()
    )


def check_duplicate(table_name, column, db):
    """Return every value stored in *column* of *table_name*.

    Args:
        table_name: Table to read from.
        column: Column whose values are returned.
        db: Object whose ``query(sql)`` yields rows convertible with ``dict()``.

    Returns:
        The column values as a list; an empty list when the query yields no
        rows.
    """
    # NOTE(review): identifiers are interpolated straight into the SQL text;
    # only pass trusted table/column names.
    rows = db.query(f'SELECT {column} FROM {table_name}')
    frame = pd.DataFrame([dict(row) for row in rows])
    if frame.empty:
        # No rows means no columns, so indexing by name would raise KeyError.
        return []
    return frame[column].to_list()


def check_page(driver):
    """Dismiss the "we could not reach you" interstitial when it is shown.

    Looks for an ``h2.headline-new`` headline containing the expected text;
    when found, prints ``'email error'`` and clicks the secondary-action
    button. Failing to find or click the button is ignored (best effort).

    Args:
        driver: Selenium driver showing the page to inspect.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    headline = soup.find('h2', class_='headline-new')
    if headline is None or '我們無法聯絡到您' not in headline.text:
        return

    print('email error')
    try:
        # Set the implicit wait before the lookup so that it applies to it.
        driver.implicitly_wait(30)
        ignore_button = driver.find_element(By.CSS_SELECTOR, "button.secondary-action-new")
        ActionChains(driver).move_to_element(ignore_button).click(ignore_button).perform()
    except WebDriverException:
        # Best effort: leave the page as it is when the button cannot be used.
        pass