noodles 1 year ago
parent
commit
71dab3f65e
3 changed files with 292 additions and 102 deletions
  1. get_person_list.py (+173 −0)
  2. person_interest.py (+18 −102)
  3. utility.py (+101 −0)

+ 173 - 0
get_person_list.py

@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+from selenium import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+from selenium.common.exceptions import WebDriverException
+
+from utility import *
+import json
+from datetime import datetime
+from bs4 import BeautifulSoup
+import time
+
+from tqdm import tqdm
+import pandas as pd
+import dataset
+import argparse
+import configparser
+config = configparser.ConfigParser()
+config.read('config.ini')
+db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/linkedin?charset=utf8mb4')
+post_list_table = db['post2']
+user_list_table = db['user']
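+# preload existing URLs so posts/users that were already crawled are skipped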
+user_url_list = check_duplicate('user', 'url', db)
+url_list = check_duplicate('post2', 'url', db)
+
+
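+# Scrape the author name, headline, body text and (optional) linked-article URL
+# of the currently loaded post. The CSS classes mirror LinkedIn's feed DOM as of
+# this commit and will need updating when LinkedIn changes its markup.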
+def get_content_info(driver):
+    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
+    
+    post_info = shop_soup.select("a.app-aware-link div.update-components-actor__meta ")[0]
+
+    post_name = post_info.find('span', class_='t-bold').text
+    post_name = string_check(post_name)
+
+    post_position = post_info.find('span', class_='t-black--light').text
+    post_position = string_check(post_position)
+    print(post_name, ';', post_position)
+    
+    content = shop_soup.find('div',class_='feed-shared-update-v2__description-wrapper').select("span[dir='ltr']")[0].text
+    print(content)
+    
+    try:
+        content_url = shop_soup.select('div.update-components-article__link-container')[0].find('a').get('href')
+    except Exception:
+        content_url = ''
+        
+    return {
+        'post_name': post_name,
+        'post_position':post_position,
+        'content':content, 
+        'content_url':content_url
+    }
+
+
+
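+# Open the reactors dialog and read the total reaction count from the "All" tab,
+# falling back to the "Like" tab when the aggregate tab is absent.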
+def get_reaction_button(driver):
+    more_reaction_button = driver.find_element(By.CSS_SELECTOR, "button[data-jump-link-target='reactors-facepile-see-more-jump-target']")
+    # print(more_reaction_button)
+    driver.implicitly_wait(30)
+    ActionChains(driver).move_to_element(more_reaction_button).click(more_reaction_button).perform()
+    time.sleep(1)
+
+    try:
+        all_reactions_num = driver.find_element(By.CSS_SELECTOR, "button[data-js-reaction-tab='ALL']").text
+        all_reactions_num = all_reactions_num.split('\n')[-1]
+        print(f'reactions numbers: {all_reactions_num}')
+        
+    except Exception:
+        all_reactions_num = driver.find_element(By.CSS_SELECTOR, "button[data-js-reaction-tab='LIKE']").text
+        all_reactions_num = all_reactions_num.split('\n')[-1]
+        print(f'reactions numbers: {all_reactions_num}')
+
+    all_reactions_num = int(all_reactions_num.replace(',', ''))
+    return all_reactions_num
+
+
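+# Page through the reactors dialog by clicking "Show more results"; each click
+# loads roughly 10 entries, and the total is capped at 2000 reactions.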
+def show_more_result(driver, all_reactions_num):
+    if all_reactions_num > 2000:
+        all_reactions_num = 2000
+
+    for i in tqdm(range(all_reactions_num // 10 + 1)):
+        for button in driver.find_elements(By.CSS_SELECTOR,'button'):
+            if button.text == 'Show more results':
+                try:
+                    button_click = button
+                    ActionChains(driver).move_to_element(button_click).click(button_click).perform()
+                except Exception:
+                    pass
+                break
+        time.sleep(1)   
+
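+# CLI flags: --user picks a credentials section in config.ini, --port targets a
+# remote webdriver hub, --enviorment (sic) switches local/remote Chrome, and
+# --url is the post to crawl.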
+def argparse_setting():
+    p = argparse.ArgumentParser()
+    p.add_argument('-u', '--user',  nargs='?', const=1, type=str, default='person1')
+    p.add_argument('-p', '--port',  nargs='?', const=1, type=int, default=4446)
+    p.add_argument('-e', '--enviorment',  nargs='?', const=1, type=str, default='windows', help="windows or linux")
+    p.add_argument('-url', '--url', required=True)
+    # p.add_argument('--add-feature-a', dest='a', action='store_true', default=False)
+    return p
+
+
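+# Flow: start a driver, log in, fetch the post, record it in `post2`, expand the
+# reactor list, then insert each new reactor into `user`.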
+def main():
+    p = argparse_setting()
+    args = p.parse_args()
+    print(args.enviorment, args.port, args.user)
+    if args.enviorment == 'windows':
+        print('windows web start')
+        driver = serive_create()
+    else:
+        print('linux web start')
+        driver = brower_start(args.port)
+
+    print(f'log in with {args.user}')
+    url = 'https://www.linkedin.com/login'
+    driver.get(url)
+    linkedin_login(driver, config, user_choose=args.user)
+    time.sleep(2)
+    check_page(driver)
+    time.sleep(2)
+
+    url = args.url
+    print(url)
+    
+    # post info
+    print('start to crawl...')
+    all_reactions_num = 0  # fallback so show_more_result still runs if all retries fail
+    for i in range(3):
+        try:
+            driver.get(url)
+            time.sleep(2)
+            post_output = get_content_info(driver)
+            post_output['url'] = url
+            post_output['dt'] = datetime.now()
+
+            if url not in url_list:
+                print('url not in post table')
+                post_list_table.insert(post_output)
+
+            all_reactions_num = get_reaction_button(driver)
+            post_list_table.upsert({'url':url,'all_reactions_num':all_reactions_num},['url'])
+            print('upsert success')
+            break
+        except Exception:
+            pass
+
+    show_more_result(driver, all_reactions_num)
+
+    a_list = driver.find_elements(By.CSS_SELECTOR, "ul.artdeco-list li a[rel='noopener noreferrer']")
+    print(len(a_list))
+
+    # reaction_list = []
+    print('start to insert...')
+    for i in a_list:
+        person_url = i.get_attribute("href")
+        reaction_info = i.text.split('\n')
+        person_name = reaction_info[0]
+        person_position = reaction_info[-1]
+        
+        if person_url not in user_url_list:
+            user_list_table.insert({
+                'post_url': url, 
+                'url': person_url,
+                'person_name': person_name,
+                'person_position': person_position,
+                'dt':  datetime.now()
+            })
+        # reaction_list += [[url, person_url, person_name, person_position]]
+
+if __name__ == '__main__':
+    main()

+ 18 - 102
person_interest.py

@@ -8,6 +8,7 @@ from selenium.webdriver.common.by import By
 from selenium.common.exceptions import TimeoutException
 from selenium.common.exceptions import WebDriverException
 
+from utility import *
 import json
 from datetime import datetime
 from bs4 import BeautifulSoup
@@ -25,99 +26,6 @@ company_list_table = db['company']
 user_list_table = db['user']
 
 
-def brower_start(port):
-    options = webdriver.ChromeOptions()
-    browser = webdriver.Remote(
-        command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
-        desired_capabilities=options.to_capabilities()
-    )
-    return browser
-
-
-def serive_create():
-    option = webdriver.ChromeOptions()
-
-    option.add_argument('--disable-web-security')
-    option.add_argument('--allow-running-insecure-content') 
-#     option.add_argument("--user-data-dir=//Users//noodles//Documents//project")
-#     option.add_argument("profile-directory="+profilepath)
-
-    driver = webdriver.Chrome('../../driver/chromedriver_20230202/chromedriver', options=option)
-    executor_url = driver.command_executor._url
-    session_id = driver.session_id
-    print (session_id)
-    print (executor_url)
-    time.sleep(3)
-    
-    return driver
-
-
-def string_check(x):
-    return x.rstrip().lstrip()
-
-
-def get_content_info(driver):
-    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
-    post_info = shop_soup.find('article').select("div[data-test-id='main-feed-activity-card__entity-lockup']")[0]
-    
-    post_name = post_info.find('a', class_='text-sm link-styled no-underline leading-open').text
-    post_name = string_check(post_name)
-
-    post_position = post_info.find('p').text
-    post_position = string_check(post_position)
-
-    print(post_name, ';', post_position)
-    
-    content = shop_soup.find('article').find('p',class_='attributed-text-segment-list__content').text
-    print(content)
-    
-    try:
-        content_url = shop_soup.select("article a[data-tracking-control-name='public_post_feed-article-content']")[0].get('href')
-    except:
-        content_url = ''
-        
-    return {
-        'post_name': post_name,
-        'post_position':post_position,
-        'content':content, 
-        'content_url':content_url
-    }
-
-
-def linkedin_login(driver, config, user_choose='person2'):
-    user = config[user_choose]['user']
-    passwd = config[user_choose]['passwd']
-
-    user_button = driver.find_element(By.ID, "username")
-    driver.implicitly_wait(30)
-    ActionChains(driver).move_to_element(user_button).click(user_button).send_keys(user).perform()
-    # time.sleep(3)
-    
-    passwd_button = driver.find_element(By.ID, "password")
-    driver.implicitly_wait(30)
-    ActionChains(driver).move_to_element(passwd_button).click(passwd_button).send_keys(passwd).send_keys(Keys.ENTER).perform()
-    # time.sleep(1)
-
-
-def check_duplicate(table_name, column):
-    result = db.query(f'SELECT {column} FROM {table_name}')
-    result = pd.DataFrame([dict(i) for i in result])
-    
-    return result[column].to_list()
-
-
-def check_page(driver):
-    soup = BeautifulSoup(driver.page_source, 'html.parser')
-    try:
-        if soup.find('h2', class_='headline-new').text.find('我們無法聯絡到您') != -1:
-            print('email error')
-            ignore_button = driver.find_element(By.CSS_SELECTOR, "button.secondary-action-new")
-            driver.implicitly_wait(30)
-            ActionChains(driver).move_to_element(ignore_button).click(ignore_button).perform()  
-    except:
-        pass
-
-
 def show_more_result(driver, company_count):
     for i in tqdm(range(int(company_count/25)+1)):
         for button in driver.find_elements(By.CSS_SELECTOR,'button.scaffold-finite-scroll__load-button'):
@@ -131,11 +39,13 @@ def show_more_result(driver, company_count):
                 break
 
 
-def get_company_from_first_page(interest_button):
+def get_company_from_first_page(interest_button, company_url_list):
     company_list = []
     for i in interest_button.find_element(By.XPATH,"../../..").find_elements(By.CSS_SELECTOR,' div.artdeco-tabpanel.active ul.pvs-list li.artdeco-list__item'):
-        company_name = i.find_element(By.CSS_SELECTOR,'span.t-bold span').text
         company_url = i.find_element(By.CSS_SELECTOR,'a[data-field="active_tab_companies_interests"]').get_attribute('href')
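+        # skip companies that are already in the company table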
+        if company_url in company_url_list:
+            continue
+        company_name = i.find_element(By.CSS_SELECTOR,'span.t-bold span').text
         company_image = i.find_element(By.CSS_SELECTOR,'img').get_attribute('src')
         company_followers = int(i.find_element(By.CSS_SELECTOR,'span.t-black--light span').text.replace(' followers','').replace(',',''))
         company_list += [company_name]
@@ -151,12 +61,14 @@ def get_company_from_first_page(interest_button):
     return company_list
 
 
-def get_company_from_next_page(driver):
+def get_company_from_next_page(driver, company_url_list):
     shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
     company_list = []
     for item in tqdm(shop_soup.find_all('li', class_='pvs-list__paged-list-item')):
         try:
             company_url = item.select("a[data-field='active_tab_companies_interests']")[0]['href']
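+            # skip companies that are already in the company table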
+            if company_url in company_url_list:
+                continue
             company_name = item.find('span', class_= 't-bold').find('span').text
             company_image = item.select('div img')[0]['src']
             company_followers = item.find('span', 't-black--light').find('span').text
@@ -208,8 +120,8 @@ def argparse_setting():
     p.add_argument('-l', '--limit_count',  nargs='?', const=1, type=int, default=20)
     p.add_argument('-u', '--user',  nargs='?', const=1, type=str, default='person1')
     p.add_argument('-p', '--port',  nargs='?', const=1, type=int, default='4446')
-    p.add_argument('-e', '--enviorment',  nargs='?', const=1, type=str, default='windows')
-    # p.add_argument('--add-feature-a', dest='a', action='store_true', default=False)
+    p.add_argument('-e', '--enviorment',  nargs='?', const=1, type=str, default='windows', help="windows or linux")
+
     return p
 
 
@@ -217,13 +129,16 @@ def main():
     p = argparse_setting()
     args = p.parse_args()
 
-    if args.enviorment == 'winodws':
+    if args.enviorment == 'windows':
+        print('windows web start')
         driver = serive_create()
     else:
+        print('linux web start')
         driver = brower_start(args.port)
     url = 'https://www.linkedin.com/login'
     driver.get(url)
 
+    print(f'log in with {args.user}')
     linkedin_login(driver, config, user_choose=args.user)
     time.sleep(2)
     check_page(driver)
@@ -232,8 +147,9 @@ def main():
     result = pd.DataFrame([dict(i) for i in result])
 
     # try:
+    print('start to crawl...')
     for k, r in result.iterrows():
-        company_url_list = check_duplicate('company', 'company_url')
+        company_url_list = check_duplicate('company', 'company_url', db)
         
         url = r['url']
         driver.get(url)
@@ -242,7 +158,7 @@ def main():
         print(f'company_count: {company_count}')
         
         if company_count == '':
-            company_list = get_company_from_first_page(interest_button)
+            company_list = get_company_from_first_page(interest_button, company_url_list)
 
         else:
             if company_count > 10:
@@ -251,7 +167,7 @@ def main():
                     company_count = 2000
                 show_more_result(driver, company_count)
                 time.sleep(1) 
-            company_list = get_company_from_next_page(driver)
+            company_list = get_company_from_next_page(driver, company_url_list)
         
         print(len(company_list))
         user_list_table.upsert({'url':url,'company_list':' | '.join(company_list[:2000])},['url'])

+ 101 - 0
utility.py

@@ -0,0 +1,101 @@
+from bs4 import BeautifulSoup
+import pandas as pd
+from selenium.webdriver.common.by import By
+from selenium import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+import time
+
+
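+# Attach to a remote Selenium hub (e.g. a selenium/standalone-chrome container)
+# listening on the given local port.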
+def brower_start(port):
+    options = webdriver.ChromeOptions()
+    browser = webdriver.Remote(
+        command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
+        desired_capabilities=options.to_capabilities()
+    )
+    return browser
+
+
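+# Start a local Chrome via a pinned chromedriver binary; prints the session id
+# and executor URL so the session can be re-attached for debugging.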
+def serive_create():
+    option = webdriver.ChromeOptions()
+
+    option.add_argument('--disable-web-security')
+    option.add_argument('--allow-running-insecure-content') 
+#     option.add_argument("--user-data-dir=//Users//noodles//Documents//project")
+#     option.add_argument("profile-directory="+profilepath)
+
+    driver = webdriver.Chrome('../../driver/chromedriver_20230202/chromedriver', options=option)
+    executor_url = driver.command_executor._url
+    session_id = driver.session_id
+    print(session_id)
+    print(executor_url)
+    time.sleep(3)
+    
+    return driver
+
+
+def string_check(x):
+    return x.strip()
+
+
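+# Same post scraper as in get_person_list.py: author name, headline, body text
+# and optional linked-article URL, tied to LinkedIn's current feed markup.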
+def get_content_info(driver):
+    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
+    
+    post_info = shop_soup.select("a.app-aware-link div.update-components-actor__meta ")[0]
+
+    post_name = post_info.find('span', class_='t-bold').text
+    post_name = string_check(post_name)
+
+    post_position = post_info.find('span', class_='t-black--light').text
+    post_position = string_check(post_position)
+    print(post_name, ';', post_position)
+    
+    content = shop_soup.find('div',class_='feed-shared-update-v2__description-wrapper').select("span[dir='ltr']")[0].text
+    print(content)
+    
+    try:
+        content_url = shop_soup.select('div.update-components-article__link-container')[0].find('a').get('href')
+    except Exception:
+        content_url = ''
+        
+    return {
+        'post_name': post_name,
+        'post_position':post_position,
+        'content':content, 
+        'content_url':content_url
+    }
+
+
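+# Fill in the login form with credentials from config.ini. Expected layout
+# (values below are placeholders, not from this repo):
+#   [person1]
+#   user = someone@example.com
+#   passwd = secret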
+def linkedin_login(driver, config, user_choose='person2'):
+    user = config[user_choose]['user']
+    passwd = config[user_choose]['passwd']
+    
+    user_button = driver.find_element(By.ID, "username")
+    driver.implicitly_wait(30)
+    ActionChains(driver).move_to_element(user_button).click(user_button).send_keys(user).perform()
+    # time.sleep(3)
+    
+    passwd_button = driver.find_element(By.ID, "password")
+    driver.implicitly_wait(30)
+    ActionChains(driver).move_to_element(passwd_button).click(passwd_button).send_keys(passwd).send_keys(Keys.ENTER).perform()
+    # time.sleep(1)
+
+
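+# Return every existing value of `column` in `table_name` for dedup checks.
+# Table/column names are interpolated into the SQL, so only pass trusted,
+# hard-coded identifiers.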
+def check_duplicate(table_name, column, db):
+    result = db.query(f'SELECT {column} FROM {table_name}')
+    result = pd.DataFrame([dict(i) for i in result])
+    
+    return result[column].to_list()
+
+
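+# Dismiss the "我們無法聯絡到您" ("we couldn't reach you") email-verification
+# prompt that LinkedIn sometimes shows after login.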
+def check_page(driver):
+    soup = BeautifulSoup(driver.page_source, 'html.parser')
+    try:
+        if soup.find('h2', class_='headline-new').text.find('我們無法聯絡到您') != -1:
+            print('email error')
+            ignore_button = driver.find_element(By.CSS_SELECTOR, "button.secondary-action-new")
+            driver.implicitly_wait(30)
+            ActionChains(driver).move_to_element(ignore_button).click(ignore_button).perform()  
+    except Exception:
+        pass
+