|
@@ -8,6 +8,7 @@ from selenium.webdriver.common.by import By
|
|
|
from selenium.common.exceptions import TimeoutException
|
|
|
from selenium.common.exceptions import WebDriverException
|
|
|
|
|
|
+from utility import *
|
|
|
import json
|
|
|
from datetime import datetime
|
|
|
from bs4 import BeautifulSoup
|
|
@@ -25,99 +26,6 @@ company_list_table = db['company']
|
|
|
user_list_table = db['user']
|
|
|
|
|
|
|
|
|
def brower_start(port):
    """Open a Chrome session on a Selenium server running on this machine.

    Args:
        port: Port of the local Selenium server; it is interpolated into
            ``http://127.0.0.1:<port>/wd/hub``.

    Returns:
        The ``webdriver.Remote`` instance created with default Chrome options.
    """
    options = webdriver.ChromeOptions()
    # Hand the options object over directly: the ``desired_capabilities``
    # keyword is deprecated in Selenium 4 and dropped by later releases,
    # while ``options=`` carries the same capabilities.
    browser = webdriver.Remote(
        command_executor=f'http://127.0.0.1:{port}/wd/hub',
        options=options,
    )
    return browser
|
|
|
-
|
|
|
-
|
|
|
def serive_create(driver_path='../../driver/chromedriver_20230202/chromedriver',
                  startup_wait=3):
    """Launch a local Chrome browser and return its driver.

    Chrome is started with ``--disable-web-security`` and
    ``--allow-running-insecure-content``. The new session id and the command
    executor URL are printed, in that order, before the function sleeps.

    Args:
        driver_path: Path to the chromedriver executable. Defaults to the
            location that used to be hard-coded here.
        startup_wait: Seconds to sleep after the browser has been created.

    Returns:
        The ``webdriver.Chrome`` instance.
    """
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')

    # NOTE(review): passing the driver path positionally is deprecated in
    # Selenium 4 in favour of a Service object -- confirm the pinned Selenium
    # version before switching.
    driver = webdriver.Chrome(driver_path, options=option)

    print(driver.session_id)
    print(driver.command_executor._url)
    time.sleep(startup_wait)

    return driver
|
|
|
-
|
|
|
-
|
|
|
def string_check(x):
    """Return *x* with leading and trailing whitespace removed."""
    # A single strip() trims both ends; no need to chain rstrip()/lstrip().
    return x.strip()
|
|
|
-
|
|
|
-
|
|
|
def get_content_info(driver):
    """Extract author and text details from the post page loaded in *driver*.

    The page source is parsed with BeautifulSoup and the first ``<article>``
    element is inspected. The author name/position and the post text are
    printed as they are found.

    Args:
        driver: Object exposing the current page HTML as ``page_source``.

    Returns:
        A dict with the keys ``post_name``, ``post_position``, ``content``
        and ``content_url``. ``content_url`` is ``''`` when the article
        contains no matching content link.

    Raises:
        AttributeError, IndexError: If the article, the author block or the
            text paragraph is missing from the page (unchanged from the
            previous behaviour; only the link lookup is optional).
    """
    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Look the article up once and reuse it for both the author and the text.
    article = shop_soup.find('article')
    post_info = article.select("div[data-test-id='main-feed-activity-card__entity-lockup']")[0]

    post_name = string_check(
        post_info.find('a', class_='text-sm link-styled no-underline leading-open').text
    )
    post_position = string_check(post_info.find('p').text)
    print(post_name, ';', post_position)

    content = article.find('p', class_='attributed-text-segment-list__content').text
    print(content)

    # The content link is optional: test for an empty result explicitly
    # instead of hiding every possible error behind a bare ``except``.
    links = shop_soup.select("article a[data-tracking-control-name='public_post_feed-article-content']")
    content_url = links[0].get('href') if links else ''

    return {
        'post_name': post_name,
        'post_position': post_position,
        'content': content,
        'content_url': content_url,
    }
|
|
|
-
|
|
|
-
|
|
|
def linkedin_login(driver, config, user_choose='person2'):
    """Fill in and submit the login form on the page currently loaded.

    Args:
        driver: Selenium driver already showing the login page.
        config: Mapping where ``config[user_choose]`` provides the ``'user'``
            and ``'passwd'`` entries to type into the form.
        user_choose: Key of the account inside *config*.

    The form is submitted by sending ENTER after the password; nothing is
    returned.
    """
    user = config[user_choose]['user']
    passwd = config[user_choose]['passwd']

    # The implicit wait has to be set BEFORE the lookups to have any effect
    # on them; it then applies to every later find_element call as well.
    driver.implicitly_wait(30)

    user_button = driver.find_element(By.ID, "username")
    ActionChains(driver).move_to_element(user_button).click(user_button).send_keys(user).perform()

    passwd_button = driver.find_element(By.ID, "password")
    ActionChains(driver).move_to_element(passwd_button).click(passwd_button).send_keys(passwd).send_keys(Keys.ENTER).perform()
|
|
|
-
|
|
|
-
|
|
|
def check_duplicate(table_name, column, database=None):
    """Return every value currently stored in *column* of *table_name*.

    Args:
        table_name: Name of the table to read.
        column: Name of the column whose values are collected.
        database: Optional object with a ``query(sql)`` method that yields
            row mappings. When omitted, the module-level ``db`` is used, so
            existing two-argument calls keep working.

    Returns:
        A list with one entry per row; an empty list when the table has no
        rows (previously this raised ``KeyError``).
    """
    conn = db if database is None else database
    # NOTE: identifiers cannot be bound as SQL parameters, so both names are
    # interpolated directly -- only pass trusted, hard-coded names.
    rows = conn.query(f'SELECT {column} FROM {table_name}')
    frame = pd.DataFrame([dict(row) for row in rows])

    # A frame built from zero rows has no columns at all, so indexing it by
    # name would fail; an empty table simply means "no known values".
    if frame.empty:
        return []
    return frame[column].to_list()
|
|
|
-
|
|
|
-
|
|
|
def check_page(driver):
    """Dismiss the interstitial whose headline contains '我們無法聯絡到您'.

    The current page source is parsed; when an ``h2.headline-new`` element
    with that text is present, ``'email error'`` is printed and the
    ``button.secondary-action-new`` button is clicked. Pages without that
    headline are left untouched.

    Args:
        driver: Selenium driver showing the page to inspect.

    This stays best-effort: a failure to find or click the button is
    reported on stdout instead of being raised.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    headline = soup.find('h2', class_='headline-new')
    # No such headline (or a different one) means there is nothing to dismiss.
    if headline is None or '我們無法聯絡到您' not in headline.text:
        return

    print('email error')
    # Set the implicit wait before the lookup so that it actually covers it.
    driver.implicitly_wait(30)
    try:
        ignore_button = driver.find_element(By.CSS_SELECTOR, "button.secondary-action-new")
        ActionChains(driver).move_to_element(ignore_button).click(ignore_button).perform()
    except WebDriverException as exc:
        # Only browser-side failures are tolerated; programming errors
        # are no longer silently swallowed.
        print(f'could not dismiss the email prompt: {exc}')
|
|
|
-
|
|
|
-
|
|
|
def show_more_result(driver, company_count):
|
|
|
for i in tqdm(range(int(company_count/25)+1)):
|
|
|
for button in driver.find_elements(By.CSS_SELECTOR,'button.scaffold-finite-scroll__load-button'):
|
|
@@ -131,11 +39,13 @@ def show_more_result(driver, company_count):
|
|
|
break
|
|
|
|
|
|
|
|
|
-def get_company_from_first_page(interest_button):
|
|
|
+def get_company_from_first_page(interest_button, company_url_list):
|
|
|
company_list = []
|
|
|
for i in interest_button.find_element(By.XPATH,"../../..").find_elements(By.CSS_SELECTOR,' div.artdeco-tabpanel.active ul.pvs-list li.artdeco-list__item'):
|
|
|
- company_name = i.find_element(By.CSS_SELECTOR,'span.t-bold span').text
|
|
|
company_url = i.find_element(By.CSS_SELECTOR,'a[data-field="active_tab_companies_interests"]').get_attribute('href')
|
|
|
+ if company_url in company_url_list:
|
|
|
+ continue
|
|
|
+ company_name = i.find_element(By.CSS_SELECTOR,'span.t-bold span').text
|
|
|
company_image = i.find_element(By.CSS_SELECTOR,'img').get_attribute('src')
|
|
|
company_followers = int(i.find_element(By.CSS_SELECTOR,'span.t-black--light span').text.replace(' followers','').replace(',',''))
|
|
|
company_list += [company_name]
|
|
@@ -151,12 +61,14 @@ def get_company_from_first_page(interest_button):
|
|
|
return company_list
|
|
|
|
|
|
|
|
|
-def get_company_from_next_page(driver):
|
|
|
+def get_company_from_next_page(driver, company_url_list):
|
|
|
shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
company_list = []
|
|
|
for item in tqdm(shop_soup.find_all('li', class_='pvs-list__paged-list-item')):
|
|
|
try:
|
|
|
company_url = item.select("a[data-field='active_tab_companies_interests']")[0]['href']
|
|
|
+ if company_url in company_url_list:
|
|
|
+ continue
|
|
|
company_name = item.find('span', class_= 't-bold').find('span').text
|
|
|
company_image = item.select('div img')[0]['src']
|
|
|
company_followers = item.find('span', 't-black--light').find('span').text
|
|
@@ -208,8 +120,8 @@ def argparse_setting():
|
|
|
p.add_argument('-l', '--limit_count', nargs='?', const=1, type=int, default=20)
|
|
|
p.add_argument('-u', '--user', nargs='?', const=1, type=str, default='person1')
|
|
|
p.add_argument('-p', '--port', nargs='?', const=1, type=int, default='4446')
|
|
|
- p.add_argument('-e', '--enviorment', nargs='?', const=1, type=str, default='windows')
|
|
|
- # p.add_argument('--add-feature-a', dest='a', action='store_true', default=False)
|
|
|
+ p.add_argument('-e', '--enviorment', nargs='?', const=1, type=str, default='windows', help="windows or linux")
|
|
|
+
|
|
|
return p
|
|
|
|
|
|
|
|
@@ -217,13 +129,16 @@ def main():
|
|
|
p = argparse_setting()
|
|
|
args = p.parse_args()
|
|
|
|
|
|
- if args.enviorment == 'winodws':
|
|
|
+ if args.enviorment == 'windows':
|
|
|
+ print('windows web start')
|
|
|
driver = serive_create()
|
|
|
else:
|
|
|
+ print('linux web start')
|
|
|
driver = brower_start(args.port)
|
|
|
url = 'https://www.linkedin.com/login'
|
|
|
driver.get(url)
|
|
|
|
|
|
+ print(f'login in with {args.user}')
|
|
|
linkedin_login(driver, config, user_choose=args.user)
|
|
|
time.sleep(2)
|
|
|
check_page(driver)
|
|
@@ -232,8 +147,9 @@ def main():
|
|
|
result = pd.DataFrame([dict(i) for i in result])
|
|
|
|
|
|
# try:
|
|
|
+ print('start to crawler...')
|
|
|
for k, r in result.iterrows():
|
|
|
- company_url_list = check_duplicate('company', 'company_url')
|
|
|
+ company_url_list = check_duplicate('company', 'company_url', db)
|
|
|
|
|
|
url = r['url']
|
|
|
driver.get(url)
|
|
@@ -242,7 +158,7 @@ def main():
|
|
|
print(f'company_count: {company_count}')
|
|
|
|
|
|
if company_count == '':
|
|
|
- company_list = get_company_from_first_page(interest_button)
|
|
|
+ company_list = get_company_from_first_page(interest_button, company_url_list)
|
|
|
|
|
|
else:
|
|
|
if company_count > 10:
|
|
@@ -251,7 +167,7 @@ def main():
|
|
|
company_count = 2000
|
|
|
show_more_result(driver, company_count)
|
|
|
time.sleep(1)
|
|
|
- company_list = get_company_from_next_page(driver)
|
|
|
+ company_list = get_company_from_next_page(driver, company_url_list)
|
|
|
|
|
|
print(len(company_list))
|
|
|
user_list_table.upsert({'url':url,'company_list':' | '.join(company_list[:2000])},['url'])
|