noodles 1 year ago
parent
commit
71dab3f65e
3 changed files with 292 additions and 102 deletions
  1. get_person_list.py (+173 −0)
  2. person_interest.py (+18 −102)
  3. utility.py (+101 −0)

+ 173 - 0
get_person_list.py

@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+from selenium import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+from selenium.common.exceptions import WebDriverException
+
+from utility import *
+import json
+from datetime import datetime
+from bs4 import BeautifulSoup
+import time
+
+from tqdm import tqdm
+import pandas as pd
+import dataset
+import argparse
+import configparser
+config = configparser.ConfigParser()
+config.read('config.ini')
+db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/linkedin?charset=utf8mb4')
+post_list_table = db['post2']
+user_list_table = db['user']
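+# preload existing URLs so posts/users that were already crawled are skipped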
+user_url_list = check_duplicate('user', 'url', db)
+url_list = check_duplicate('post2', 'url', db)
+
+
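+# Scrape the author name, headline, body text and (optional) linked-article URL
+# of the currently loaded post. The CSS classes mirror LinkedIn's feed DOM as of
+# this commit and will need updating when LinkedIn changes its markup.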
+def get_content_info(driver):
+    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
+    
+    post_info = shop_soup.select("a.app-aware-link div.update-components-actor__meta ")[0]
+
+    post_name = post_info.find('span', class_='t-bold').text
+    post_name = string_check(post_name)
+
+    post_position = post_info.find('span', class_='t-black--light').text
+    post_position = string_check(post_position)
+    print(post_name, ';', post_position)
+    
+    content = shop_soup.find('div',class_='feed-shared-update-v2__description-wrapper').select("span[dir='ltr']")[0].text
+    print(content)
+    
+    try:
+        content_url = shop_soup.select('div.update-components-article__link-container')[0].find('a').get('href')
+    except Exception:
+        content_url = ''
+        
+    return {
+        'post_name': post_name,
+        'post_position':post_position,
+        'content':content, 
+        'content_url':content_url
+    }
+
+
+
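+# Open the reactors dialog and read the total reaction count from the "All" tab,
+# falling back to the "Like" tab when the aggregate tab is absent.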
+def get_reaction_button(driver):
+    more_reaction_button = driver.find_element(By.CSS_SELECTOR, "button[data-jump-link-target='reactors-facepile-see-more-jump-target']")
+    # print(more_reaction_button)
+    driver.implicitly_wait(30)
+    ActionChains(driver).move_to_element(more_reaction_button).click(more_reaction_button).perform()
+    time.sleep(1)
+
+    try:
+        all_reactions_num = driver.find_element(By.CSS_SELECTOR, "button[data-js-reaction-tab='ALL']").text
+        all_reactions_num = all_reactions_num.split('\n')[-1]
+        print(f'reactions numbers: {all_reactions_num}')
+        
+    except Exception:
+        all_reactions_num = driver.find_element(By.CSS_SELECTOR, "button[data-js-reaction-tab='LIKE']").text
+        all_reactions_num = all_reactions_num.split('\n')[-1]
+        print(f'reactions numbers: {all_reactions_num}')
+
+    all_reactions_num = int(all_reactions_num.replace(',', ''))
+    return all_reactions_num
+
+
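+# Page through the reactors dialog by clicking "Show more results"; each click
+# loads roughly 10 entries, and the total is capped at 2000 reactions.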
+def show_more_result(driver, all_reactions_num):
+    if all_reactions_num > 2000:
+        all_reactions_num = 2000
+
+    for i in tqdm(range(all_reactions_num // 10 + 1)):
+        for button in driver.find_elements(By.CSS_SELECTOR,'button'):
+            if button.text == 'Show more results':
+                try:
+                    button_click = button
+                    ActionChains(driver).move_to_element(button_click).click(button_click).perform()
+                except Exception:
+                    pass
+                break
+        time.sleep(1)   
+
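+# CLI flags: --user picks a credentials section in config.ini, --port targets a
+# remote webdriver hub, --enviorment (sic) switches local/remote Chrome, and
+# --url is the post to crawl.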
+def argparse_setting():
+    p = argparse.ArgumentParser()
+    p.add_argument('-u', '--user',  nargs='?', const=1, type=str, default='person1')
+    p.add_argument('-p', '--port',  nargs='?', const=1, type=int, default=4446)
+    p.add_argument('-e', '--enviorment',  nargs='?', const=1, type=str, default='windows', help="windows or linux")
+    p.add_argument('-url', '--url', required=True)
+    # p.add_argument('--add-feature-a', dest='a', action='store_true', default=False)
+    return p
+
+
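+# Flow: start a driver, log in, fetch the post, record it in `post2`, expand the
+# reactor list, then insert each new reactor into `user`.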
+def main():
+    p = argparse_setting()
+    args = p.parse_args()
+    print(args.enviorment, args.port, args.user)
+    if args.enviorment == 'windows':
+        print('windows web start')
+        driver = serive_create()
+    else:
+        print('linux web start')
+        driver = brower_start(args.port)
+
+    print(f'log in with {args.user}')
+    url = 'https://www.linkedin.com/login'
+    driver.get(url)
+    linkedin_login(driver, config, user_choose=args.user)
+    time.sleep(2)
+    check_page(driver)
+    time.sleep(2)
+
+    url = args.url
+    print(url)
+    
+    # post info
+    print('start to crawl...')
+    all_reactions_num = 0  # fallback so show_more_result still runs if all retries fail
+    for i in range(3):
+        try:
+            driver.get(url)
+            time.sleep(2)
+            post_output = get_content_info(driver)
+            post_output['url'] = url
+            post_output['dt'] = datetime.now()
+
+            if url not in url_list:
+                print('url not in post table')
+                post_list_table.insert(post_output)
+
+            all_reactions_num = get_reaction_button(driver)
+            post_list_table.upsert({'url':url,'all_reactions_num':all_reactions_num},['url'])
+            print('upsert success')
+            break
+        except Exception:
+            pass
+
+    show_more_result(driver, all_reactions_num)
+
+    a_list = driver.find_elements(By.CSS_SELECTOR, "ul.artdeco-list li a[rel='noopener noreferrer']")
+    print(len(a_list))
+
+    # reaction_list = []
+    print('start to insert...')
+    for i in a_list:
+        person_url = i.get_attribute("href")
+        reaction_info = i.text.split('\n')
+        person_name = reaction_info[0]
+        person_position = reaction_info[-1]
+        
+        if person_url not in user_url_list:
+            user_list_table.insert({
+                'post_url': url, 
+                'url': person_url,
+                'person_name': person_name,
+                'person_position': person_position,
+                'dt':  datetime.now()
+            })
+        # reaction_list += [[url, person_url, person_name, person_position]]
+
+if __name__ == '__main__':
+    main()

+ 18 - 102
person_interest.py

@@ -8,6 +8,7 @@ from selenium.webdriver.common.by import By
 from selenium.common.exceptions import TimeoutException
 from selenium.common.exceptions import WebDriverException
 
+from utility import *
 import json
 from datetime import datetime
 from bs4 import BeautifulSoup
@@ -25,99 +26,6 @@ company_list_table = db['company']
 user_list_table = db['user']
 
 
-def brower_start(port):
-    options = webdriver.ChromeOptions()
-    browser = webdriver.Remote(
-        command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
-        desired_capabilities=options.to_capabilities()
-    )
-    return browser
-
-
-def serive_create():
-    option = webdriver.ChromeOptions()
-
-    option.add_argument('--disable-web-security')
-    option.add_argument('--allow-running-insecure-content') 
-#     option.add_argument("--user-data-dir=//Users//noodles//Documents//project")
-#     option.add_argument("profile-directory="+profilepath)
-
-    driver = webdriver.Chrome('../../driver/chromedriver_20230202/chromedriver', options=option)
-    executor_url = driver.command_executor._url
-    session_id = driver.session_id
-    print (session_id)
-    print (executor_url)
-    time.sleep(3)
-    
-    return driver
-
-
-def string_check(x):
-    return x.rstrip().lstrip()
-
-
-def get_content_info(driver):
-    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
-    post_info = shop_soup.find('article').select("div[data-test-id='main-feed-activity-card__entity-lockup']")[0]
-    
-    post_name = post_info.find('a', class_='text-sm link-styled no-underline leading-open').text
-    post_name = string_check(post_name)
-
-    post_position = post_info.find('p').text
-    post_position = string_check(post_position)
-
-    print(post_name, ';', post_position)
-    
-    content = shop_soup.find('article').find('p',class_='attributed-text-segment-list__content').text
-    print(content)
-    
-    try:
-        content_url = shop_soup.select("article a[data-tracking-control-name='public_post_feed-article-content']")[0].get('href')
-    except:
-        content_url = ''
-        
-    return {
-        'post_name': post_name,
-        'post_position':post_position,
-        'content':content, 
-        'content_url':content_url
-    }
-
-
-def linkedin_login(driver, config, user_choose='person2'):
-    user = config[user_choose]['user']
-    passwd = config[user_choose]['passwd']
-
-    user_button = driver.find_element(By.ID, "username")
-    driver.implicitly_wait(30)
-    ActionChains(driver).move_to_element(user_button).click(user_button).send_keys(user).perform()
-    # time.sleep(3)
-    
-    passwd_button = driver.find_element(By.ID, "password")
-    driver.implicitly_wait(30)
-    ActionChains(driver).move_to_element(passwd_button).click(passwd_button).send_keys(passwd).send_keys(Keys.ENTER).perform()
-    # time.sleep(1)
-
-
-def check_duplicate(table_name, column):
-    result = db.query(f'SELECT {column} FROM {table_name}')
-    result = pd.DataFrame([dict(i) for i in result])
-    
-    return result[column].to_list()
-
-
-def check_page(driver):
-    soup = BeautifulSoup(driver.page_source, 'html.parser')
-    try:
-        if soup.find('h2', class_='headline-new').text.find('我們無法聯絡到您') != -1:
-            print('email error')
-            ignore_button = driver.find_element(By.CSS_SELECTOR, "button.secondary-action-new")
-            driver.implicitly_wait(30)
-            ActionChains(driver).move_to_element(ignore_button).click(ignore_button).perform()  
-    except:
-        pass
-
-
 def show_more_result(driver, company_count):
     for i in tqdm(range(int(company_count/25)+1)):
         for button in driver.find_elements(By.CSS_SELECTOR,'button.scaffold-finite-scroll__load-button'):
@@ -131,11 +39,13 @@ def show_more_result(driver, company_count):
                 break
 
 
-def get_company_from_first_page(interest_button):
+def get_company_from_first_page(interest_button, company_url_list):
     company_list = []
     for i in interest_button.find_element(By.XPATH,"../../..").find_elements(By.CSS_SELECTOR,' div.artdeco-tabpanel.active ul.pvs-list li.artdeco-list__item'):
-        company_name = i.find_element(By.CSS_SELECTOR,'span.t-bold span').text
         company_url = i.find_element(By.CSS_SELECTOR,'a[data-field="active_tab_companies_interests"]').get_attribute('href')
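+        # skip companies that are already in the company table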
+        if company_url in company_url_list:
+            continue
+        company_name = i.find_element(By.CSS_SELECTOR,'span.t-bold span').text
         company_image = i.find_element(By.CSS_SELECTOR,'img').get_attribute('src')
         company_followers = int(i.find_element(By.CSS_SELECTOR,'span.t-black--light span').text.replace(' followers','').replace(',',''))
         company_list += [company_name]
@@ -151,12 +61,14 @@ def get_company_from_first_page(interest_button):
     return company_list
 
 
-def get_company_from_next_page(driver):
+def get_company_from_next_page(driver, company_url_list):
     shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
     company_list = []
     for item in tqdm(shop_soup.find_all('li', class_='pvs-list__paged-list-item')):
         try:
             company_url = item.select("a[data-field='active_tab_companies_interests']")[0]['href']
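+            # skip companies that are already in the company table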
+            if company_url in company_url_list:
+                continue
             company_name = item.find('span', class_= 't-bold').find('span').text
             company_image = item.select('div img')[0]['src']
             company_followers = item.find('span', 't-black--light').find('span').text
@@ -208,8 +120,8 @@ def argparse_setting():
     p.add_argument('-l', '--limit_count',  nargs='?', const=1, type=int, default=20)
     p.add_argument('-u', '--user',  nargs='?', const=1, type=str, default='person1')
     p.add_argument('-p', '--port',  nargs='?', const=1, type=int, default='4446')
-    p.add_argument('-e', '--enviorment',  nargs='?', const=1, type=str, default='windows')
-    # p.add_argument('--add-feature-a', dest='a', action='store_true', default=False)
+    p.add_argument('-e', '--enviorment',  nargs='?', const=1, type=str, default='windows', help="windows or linux")
+
     return p
 
 
@@ -217,13 +129,16 @@ def main():
     p = argparse_setting()
     args = p.parse_args()
 
-    if args.enviorment == 'winodws':
+    if args.enviorment == 'windows':
+        print('windows web start')
         driver = serive_create()
     else:
+        print('linux web start')
         driver = brower_start(args.port)
     url = 'https://www.linkedin.com/login'
     driver.get(url)
 
+    print(f'log in with {args.user}')
     linkedin_login(driver, config, user_choose=args.user)
     time.sleep(2)
     check_page(driver)
@@ -232,8 +147,9 @@ def main():
     result = pd.DataFrame([dict(i) for i in result])
 
     # try:
+    print('start to crawl...')
     for k, r in result.iterrows():
-        company_url_list = check_duplicate('company', 'company_url')
+        company_url_list = check_duplicate('company', 'company_url', db)
         
         url = r['url']
         driver.get(url)
@@ -242,7 +158,7 @@ def main():
         print(f'company_count: {company_count}')
         
         if company_count == '':
-            company_list = get_company_from_first_page(interest_button)
+            company_list = get_company_from_first_page(interest_button, company_url_list)
 
         else:
             if company_count > 10:
@@ -251,7 +167,7 @@ def main():
                     company_count = 2000
                 show_more_result(driver, company_count)
                 time.sleep(1) 
-            company_list = get_company_from_next_page(driver)
+            company_list = get_company_from_next_page(driver, company_url_list)
         
         print(len(company_list))
         user_list_table.upsert({'url':url,'company_list':' | '.join(company_list[:2000])},['url'])

+ 101 - 0
utility.py

@@ -0,0 +1,101 @@
+from bs4 import BeautifulSoup
+import pandas as pd
+from selenium.webdriver.common.by import By
+from selenium import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+import time
+
+
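+# Attach to a remote Selenium hub (e.g. a selenium/standalone-chrome container)
+# listening on the given local port.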
+def brower_start(port):
+    options = webdriver.ChromeOptions()
+    browser = webdriver.Remote(
+        command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
+        desired_capabilities=options.to_capabilities()
+    )
+    return browser
+
+
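+# Start a local Chrome via a pinned chromedriver binary; prints the session id
+# and executor URL so the session can be re-attached for debugging.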
+def serive_create():
+    option = webdriver.ChromeOptions()
+
+    option.add_argument('--disable-web-security')
+    option.add_argument('--allow-running-insecure-content') 
+#     option.add_argument("--user-data-dir=//Users//noodles//Documents//project")
+#     option.add_argument("profile-directory="+profilepath)
+
+    driver = webdriver.Chrome('../../driver/chromedriver_20230202/chromedriver', options=option)
+    executor_url = driver.command_executor._url
+    session_id = driver.session_id
+    print(session_id)
+    print(executor_url)
+    time.sleep(3)
+    
+    return driver
+
+
+def string_check(x):
+    return x.strip()
+
+
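+# Same post scraper as in get_person_list.py: author name, headline, body text
+# and optional linked-article URL, tied to LinkedIn's current feed markup.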
+def get_content_info(driver):
+    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
+    
+    post_info = shop_soup.select("a.app-aware-link div.update-components-actor__meta ")[0]
+
+    post_name = post_info.find('span', class_='t-bold').text
+    post_name = string_check(post_name)
+
+    post_position = post_info.find('span', class_='t-black--light').text
+    post_position = string_check(post_position)
+    print(post_name, ';', post_position)
+    
+    content = shop_soup.find('div',class_='feed-shared-update-v2__description-wrapper').select("span[dir='ltr']")[0].text
+    print(content)
+    
+    try:
+        content_url = shop_soup.select('div.update-components-article__link-container')[0].find('a').get('href')
+    except Exception:
+        content_url = ''
+        
+    return {
+        'post_name': post_name,
+        'post_position':post_position,
+        'content':content, 
+        'content_url':content_url
+    }
+
+
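+# Fill in the login form with credentials from config.ini. Expected layout
+# (values below are placeholders, not from this repo):
+#   [person1]
+#   user = someone@example.com
+#   passwd = secret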
+def linkedin_login(driver, config, user_choose='person2'):
+    user = config[user_choose]['user']
+    passwd = config[user_choose]['passwd']
+    
+    user_button = driver.find_element(By.ID, "username")
+    driver.implicitly_wait(30)
+    ActionChains(driver).move_to_element(user_button).click(user_button).send_keys(user).perform()
+    # time.sleep(3)
+    
+    passwd_button = driver.find_element(By.ID, "password")
+    driver.implicitly_wait(30)
+    ActionChains(driver).move_to_element(passwd_button).click(passwd_button).send_keys(passwd).send_keys(Keys.ENTER).perform()
+    # time.sleep(1)
+
+
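+# Return every existing value of `column` in `table_name` for dedup checks.
+# Table/column names are interpolated into the SQL, so only pass trusted,
+# hard-coded identifiers.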
+def check_duplicate(table_name, column, db):
+    result = db.query(f'SELECT {column} FROM {table_name}')
+    result = pd.DataFrame([dict(i) for i in result])
+    
+    return result[column].to_list()
+
+
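+# Dismiss the "我們無法聯絡到您" ("we couldn't reach you") email-verification
+# prompt that LinkedIn sometimes shows after login.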
+def check_page(driver):
+    soup = BeautifulSoup(driver.page_source, 'html.parser')
+    try:
+        if soup.find('h2', class_='headline-new').text.find('我們無法聯絡到您') != -1:
+            print('email error')
+            ignore_button = driver.find_element(By.CSS_SELECTOR, "button.secondary-action-new")
+            driver.implicitly_wait(30)
+            ActionChains(driver).move_to_element(ignore_button).click(ignore_button).perform()  
+    except Exception:
+        pass
+