123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101 |
- from bs4 import BeautifulSoup
- import pandas as pd
- from selenium.webdriver.common.by import By
- from selenium import webdriver
- from selenium.webdriver.common.action_chains import ActionChains
- from selenium.webdriver.common.keys import Keys
- import time
def brower_start(port):
    """Open a Chrome ``Remote`` WebDriver session against a local server.

    Args:
        port: Port of the WebDriver server on 127.0.0.1. It is interpolated
            into ``http://127.0.0.1:<port>/wd/hub``.

    Returns:
        The ``webdriver.Remote`` instance created with default
        ``ChromeOptions``.
    """
    options = webdriver.ChromeOptions()
    # Hand the options object over directly: the ``desired_capabilities``
    # keyword is deprecated in Selenium 4 and dropped in later releases,
    # while ``options`` is accepted by both and yields the same capabilities.
    return webdriver.Remote(
        command_executor=f'http://127.0.0.1:{port}/wd/hub',
        options=options,
    )
def serive_create():
    """Start a local Chrome driver and return it.

    Chrome is launched with ``--disable-web-security`` and
    ``--allow-running-insecure-content`` using the chromedriver binary at a
    fixed relative path. The session id and the command-executor URL are
    printed, then the function sleeps for three seconds before returning.

    Returns:
        The ``webdriver.Chrome`` instance that was started.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--disable-web-security', '--allow-running-insecure-content'):
        chrome_options.add_argument(flag)
    # Currently disabled profile options, kept for reference:
    # chrome_options.add_argument("--user-data-dir=//Users//noodles//Documents//project")
    # chrome_options.add_argument("profile-directory="+profilepath)

    driver = webdriver.Chrome(
        '../../driver/chromedriver_20230202/chromedriver',
        options=chrome_options,
    )

    # Print the identifiers of the new session (session id, then URL).
    print(driver.session_id)
    print(driver.command_executor._url)

    time.sleep(3)
    return driver
def string_check(x):
    """Return *x* with leading and trailing whitespace removed."""
    return x.strip()
def get_content_info(driver):
    """Extract author, position, text and link from the current post page.

    The page is read from ``driver.page_source`` and parsed with
    BeautifulSoup's ``html.parser``. The author name/position and the post
    text are printed as they are found.

    Args:
        driver: Object exposing the rendered HTML as ``page_source``.

    Returns:
        A dict with the keys ``post_name``, ``post_position``, ``content``
        and ``content_url``. ``content_url`` is ``''`` when the post has no
        article link container or no anchor inside it.

    Raises:
        IndexError: If the author block or the ``span[dir='ltr']`` text
            element is missing.
        AttributeError: If an expected name/position span or the
            description wrapper is missing.
    """
    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Author block: first match of the actor-meta container inside a link.
    post_info = shop_soup.select("a.app-aware-link div.update-components-actor__meta ")[0]
    post_name = string_check(post_info.find('span', class_='t-bold').text)
    post_position = string_check(post_info.find('span', class_='t-black--light').text)
    print(post_name, ';', post_position)

    description = shop_soup.find('div', class_='feed-shared-update-v2__description-wrapper')
    content = description.select("span[dir='ltr']")[0].text
    print(content)

    # The article link is optional. Only the two failures this lookup can
    # produce are treated as "no link": an empty select() result
    # (IndexError) or a container without an <a> (AttributeError on None).
    # A bare ``except`` here would also hide KeyboardInterrupt/SystemExit.
    try:
        link_box = shop_soup.select('div.update-components-article__link-container')[0]
        content_url = link_box.find('a').get('href')
    except (IndexError, AttributeError):
        content_url = ''

    return {
        'post_name': post_name,
        'post_position': post_position,
        'content': content,
        'content_url': content_url,
    }
def linkedin_login(driver, config, user_choose='person2'):
    """Fill in the login form on the current page and submit it.

    Args:
        driver: Selenium WebDriver already showing a page that contains the
            ``username`` and ``password`` inputs.
        config: Mapping of account keys to mappings holding ``'user'`` and
            ``'passwd'`` entries.
        user_choose: Key in ``config`` selecting which account to use.

    Raises:
        KeyError: If ``user_choose`` or one of the credential keys is
            missing from ``config``.
    """
    credentials = config[user_choose]
    user = credentials['user']
    passwd = credentials['passwd']

    # The implicit wait is a driver-wide setting that applies to subsequent
    # element lookups, so it has to be set BEFORE the first find_element;
    # setting it afterwards left the username lookup without any wait.
    driver.implicitly_wait(30)

    user_button = driver.find_element(By.ID, "username")
    ActionChains(driver).move_to_element(user_button).click(user_button).send_keys(user).perform()

    passwd_button = driver.find_element(By.ID, "password")
    # ENTER after the password submits the form.
    ActionChains(driver).move_to_element(passwd_button).click(passwd_button).send_keys(passwd).send_keys(Keys.ENTER).perform()
def check_duplicate(table_name, column, db):
    """Return every value stored in one column of a table.

    Args:
        table_name: Name of the table to read.
        column: Name of the column to select; also the key used to pull the
            values out of each returned row.
        db: Object whose ``query(sql)`` method returns an iterable of rows
            convertible with ``dict(row)``.

    Returns:
        A list of the column's values, or an empty list when the query
        returns no rows.
    """
    # NOTE(review): identifiers are interpolated straight into the SQL text
    # (they cannot be bound as parameters), so both names must come from
    # trusted code, never from user input.
    rows = [dict(row) for row in db.query(f'SELECT {column} FROM {table_name}')]

    # An empty result would build a DataFrame without any columns, and the
    # column lookup below would raise KeyError; there is simply nothing to
    # report in that case.
    if not rows:
        return []

    return pd.DataFrame(rows)[column].to_list()
def check_page(driver):
    """Dismiss the "we couldn't reach you" interstitial if it is shown.

    The current page source is parsed; when its ``h2.headline-new`` heading
    contains the text '我們無法聯絡到您' ("we couldn't reach you"),
    'email error' is printed and the ``button.secondary-action-new`` button
    is clicked. This is best-effort: if the heading is absent nothing
    happens, and a WebDriver failure while clicking is reported with
    ``print`` instead of being raised.

    Args:
        driver: Selenium WebDriver showing the page to inspect.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this module.
    from selenium.common.exceptions import WebDriverException

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    headline = soup.find('h2', class_='headline-new')
    if headline is None or '我們無法聯絡到您' not in headline.text:
        return

    print('email error')
    try:
        # The implicit wait only affects lookups made after it is set, so
        # it must precede find_element to have any effect on it.
        driver.implicitly_wait(30)
        ignore_button = driver.find_element(By.CSS_SELECTOR, "button.secondary-action-new")
        ActionChains(driver).move_to_element(ignore_button).click(ignore_button).perform()
    except WebDriverException as exc:
        # Still non-fatal, but no longer silent.
        print(f'could not dismiss email prompt: {exc}')
|