Parcourir la source

'crawler_person_interest.py'

noodles il y a 1 an
Parent
commit
cf9b8e9e14
2 fichiers modifiés avec 278 ajouts et 0 suppressions
  1. 11 0
      config.ini
  2. 267 0
      person_interest.py

+ 11 - 0
config.ini

@@ -0,0 +1,11 @@
+[person1]
+user = 
+passwd = 
+
+[person2]
+user = 
+passwd = 
+
+[person3]
+user = 
+passwd = 

+ 267 - 0
person_interest.py

@@ -0,0 +1,267 @@
+# -*- coding: utf-8 -*-
+from selenium import webdriver
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+from selenium.common.exceptions import WebDriverException
+
+import json
+from datetime import datetime
+from bs4 import BeautifulSoup
+import time
+
+from tqdm import tqdm
+import pandas as pd
+import dataset
+import argparse
+import configparser
# Account credentials are read from config.ini (one section per account,
# each with ``user`` and ``passwd`` keys).
config = configparser.ConfigParser()
config.read('config.ini')

# SECURITY NOTE(review): the fallback URL embeds database credentials in the
# source. It is kept so existing deployments keep working, but the URL should
# be supplied through an optional ``[database]`` section (key ``url``) in
# config.ini and the committed password rotated.
# ``raw=True`` disables ``%`` interpolation so URL-encoded characters survive.
db = dataset.connect(
    config.get(
        'database',
        'url',
        raw=True,
        fallback='mysql://choozmo:pAssw0rd@db.ptt.cx:3306/linkedin?charset=utf8mb4',
    )
)
company_list_table = db['company']
user_list_table = db['user']
+
+
def brower_start(port):
    """Open a Chrome session on a Selenium server running on localhost.

    Args:
        port: Port of the Selenium server listening on 127.0.0.1.

    Returns:
        The connected ``webdriver.Remote`` instance.
    """
    options = webdriver.ChromeOptions()
    # Hand the options object over directly: the ``desired_capabilities``
    # keyword is deprecated in Selenium 4 and dropped in later releases.
    return webdriver.Remote(
        command_executor=f'http://127.0.0.1:{port}/wd/hub',
        options=options,
    )
+
+
def serive_create():
    """Start a local Chrome via the chromedriver binary shipped with the project.

    The browser is launched with web security disabled and insecure content
    allowed. The session id and the command-executor URL are printed, then the
    function sleeps for three seconds before handing the driver back.

    Returns:
        The started ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--disable-web-security', '--allow-running-insecure-content'):
        chrome_options.add_argument(flag)

    driver = webdriver.Chrome(
        '../../driver/chromedriver_20230202/chromedriver',
        options=chrome_options,
    )

    # Read the executor URL first, then print session id followed by the URL.
    remote_url = driver.command_executor._url
    print (driver.session_id)
    print (remote_url)
    time.sleep(3)

    return driver
+
+
def string_check(x):
    """Return *x* with leading and trailing whitespace removed."""
    return x.strip()
+
+
def get_content_info(driver):
    """Extract author and text details from the post page loaded in *driver*.

    Args:
        driver: Selenium driver; its ``page_source`` is parsed with
            BeautifulSoup.

    Returns:
        A dict with the keys ``post_name``, ``post_position``, ``content`` and
        ``content_url``. ``content_url`` is ``''`` when the page has no
        matching article link.

    Raises:
        AttributeError, IndexError: If the ``<article>`` element, the author
            lockup or the text paragraph is missing from the page.
    """
    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Look the article up once and reuse it for author and content.
    article = shop_soup.find('article')
    post_info = article.select("div[data-test-id='main-feed-activity-card__entity-lockup']")[0]

    post_name = string_check(
        post_info.find('a', class_='text-sm link-styled no-underline leading-open').text
    )
    post_position = string_check(post_info.find('p').text)
    print(post_name, ';', post_position)

    content = article.find('p', class_='attributed-text-segment-list__content').text
    print(content)

    # The shared link is optional: test for an empty match instead of
    # swallowing every exception with a bare ``except``.
    links = shop_soup.select("article a[data-tracking-control-name='public_post_feed-article-content']")
    content_url = links[0].get('href') if links else ''

    return {
        'post_name': post_name,
        'post_position': post_position,
        'content': content,
        'content_url': content_url,
    }
+
+
def linkedin_login(driver, config, user_choose='person2'):
    """Fill in and submit the login form shown in *driver*.

    Args:
        driver: Selenium driver already positioned on the login page.
        config: Mapping of account sections; ``config[user_choose]`` must
            provide ``user`` and ``passwd``.
        user_choose: Name of the section whose credentials are used.

    Raises:
        KeyError: If the section or one of its keys is missing.
    """
    user = config[user_choose]['user']
    passwd = config[user_choose]['passwd']

    # The implicit wait only affects lookups issued after it is set, so it
    # has to be configured before the first find_element call.
    driver.implicitly_wait(30)

    user_button = driver.find_element(By.ID, "username")
    ActionChains(driver).move_to_element(user_button).click(user_button).send_keys(user).perform()

    passwd_button = driver.find_element(By.ID, "password")
    # Pressing ENTER after the password submits the form.
    ActionChains(driver).move_to_element(passwd_button).click(passwd_button).send_keys(passwd).send_keys(Keys.ENTER).perform()
+
+
def check_duplicate(table_name, column):
    """Return every value stored in *column* of *table_name* as a list.

    Args:
        table_name: Name of the table to read.
        column: Name of the column to collect.

    Returns:
        A list with one entry per row; an empty list when the table has no
        rows (building a DataFrame first raised ``KeyError`` in that case).
    """
    # NOTE: both identifiers are interpolated into the SQL text, so they must
    # come from trusted code, never from user input.
    rows = db.query(f'SELECT {column} FROM {table_name}')
    return [row[column] for row in rows]
+
+
def check_page(driver):
    """Dismiss the "we could not reach you" e-mail prompt if it is displayed.

    The page is left untouched when the prompt headline is absent. Failing to
    locate or click the dismiss button is reported but not raised, because the
    step is best-effort.

    Args:
        driver: Selenium driver showing the page that follows the login.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    headline = soup.find('h2', class_='headline-new')
    # The literal below is the prompt headline text the page is checked for.
    if headline is None or headline.text.find('我們無法聯絡到您') == -1:
        return

    print('email error')
    # Set the wait before the lookup so it actually applies to it.
    driver.implicitly_wait(30)
    try:
        ignore_button = driver.find_element(By.CSS_SELECTOR, "button.secondary-action-new")
        ActionChains(driver).move_to_element(ignore_button).click(ignore_button).perform()
    except WebDriverException as err:
        print(f'could not dismiss email prompt: {err}')
+
+
def show_more_result(driver, company_count):
    """Click "Show more results" repeatedly to expand the company list.

    The button is looked up and clicked ``int(company_count / 25) + 1`` times
    (presumably one click per batch of 25 entries -- confirm against the
    page), pausing one second after each click.

    Args:
        driver: Selenium driver showing the paged company list.
        company_count: Number of companies expected in the list.
    """
    for _ in tqdm(range(int(company_count / 25) + 1)):
        for button in driver.find_elements(By.CSS_SELECTOR, 'button.scaffold-finite-scroll__load-button'):
            if button.text != 'Show more results':
                continue
            try:
                ActionChains(driver).move_to_element(button).click(button).perform()
            except WebDriverException:
                # Best-effort: a failed click is retried on the next round.
                pass
            time.sleep(1)
            break
+
+
def get_company_from_first_page(interest_button):
    """Store the companies listed in the active interests tab panel.

    Every list item found three levels above *interest_button* is parsed and
    inserted into the ``company`` table. Items that cannot be parsed are
    reported and skipped, so one malformed entry no longer aborts the list.

    Args:
        interest_button: Selenium element of the selected "Companies" tab.

    Returns:
        The names of the companies that were parsed, in page order.
    """
    company_list = []
    panel_root = interest_button.find_element(By.XPATH, "../../..")
    items = panel_root.find_elements(
        By.CSS_SELECTOR,
        ' div.artdeco-tabpanel.active ul.pvs-list li.artdeco-list__item',
    )
    for item in items:
        try:
            company_name = item.find_element(By.CSS_SELECTOR, 'span.t-bold span').text
            company_url = item.find_element(
                By.CSS_SELECTOR, 'a[data-field="active_tab_companies_interests"]'
            ).get_attribute('href')
            company_image = item.find_element(By.CSS_SELECTOR, 'img').get_attribute('src')
            followers_text = item.find_element(By.CSS_SELECTOR, 'span.t-black--light span').text
            company_followers = int(followers_text.replace(' followers', '').replace(',', ''))
        except (WebDriverException, ValueError) as err:
            # Missing sub-element or a follower count that is not a number.
            print(f'skip company item: {err}')
            continue

        company_list.append(company_name)
        company_list_table.insert({
            'company_url': company_url,
            'company_name': company_name,
            'company_image': company_image,
            'company_followers': company_followers,
            'dt': datetime.now(),
        })

    return company_list
+
+
def get_company_from_next_page(driver):
    """Store the companies shown on the expanded "all companies" page.

    The current page source is parsed with BeautifulSoup. Each paged list
    item is inserted into the ``company`` table. Items that cannot be parsed
    are skipped; a failed insert is reported and the remaining items are
    still processed.

    Args:
        driver: Selenium driver showing the expanded company list.

    Returns:
        The names of the companies that were parsed, in page order.
    """
    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
    company_list = []
    for item in tqdm(shop_soup.find_all('li', class_='pvs-list__paged-list-item')):
        try:
            company_url = item.select("a[data-field='active_tab_companies_interests']")[0]['href']
            company_name = item.find('span', class_='t-bold').find('span').text
            company_image = item.select('div img')[0]['src']
            followers_text = item.find('span', 't-black--light').find('span').text
            company_followers = int(followers_text.replace(' followers', '').replace(',', ''))
        except (IndexError, KeyError, AttributeError, ValueError):
            # Not a company entry or unexpected markup: skip it, but no
            # longer swallow KeyboardInterrupt/SystemExit as a bare except did.
            continue

        company_list.append(company_name)
        try:
            company_list_table.insert({
                'company_url': company_url,
                'company_name': company_name,
                'company_image': company_image,
                'company_followers': company_followers,
                'dt': datetime.now(),
            })
        except Exception as err:
            # Inserts stay best-effort, but failures are now visible.
            print(f'insert failed for {company_url}: {err}')

    return company_list
+
+
def move_to_interest_company_web(driver):
    """Open the "Companies" tab of the interests section and expand it.

    The tab whose label is ``Companies`` is clicked first. If a footer
    containing the word ``companies`` is present (e.g. "Show all N
    companies"), its number is parsed and the footer is clicked.

    Args:
        driver: Selenium driver showing a profile page.

    Returns:
        A ``(company_count, interest_button)`` tuple. ``company_count`` is the
        parsed integer, or ``''`` when no such footer exists.
        ``interest_button`` is the clicked tab element, or ``None`` when the
        page has no "Companies" tab (this used to raise UnboundLocalError).
    """
    interest_div = driver.find_element(By.ID, 'interests')
    interest_div_parent = interest_div.find_element(By.XPATH, "..")

    # Switch to the company tab.
    interest_button = None
    for button in interest_div_parent.find_elements(By.CSS_SELECTOR, 'button span'):
        if button.text == 'Companies':
            interest_button = button
            ActionChains(driver).move_to_element(button).click(button).perform()
            break

    # Expand the full list when the section offers a "Show all" footer.
    company_count = ''
    footers = interest_div_parent.find_elements(By.CSS_SELECTOR, 'div.pvs-list__footer-wrapper')
    for footer in footers:
        if footer.text.find('companies') != -1:
            company_count = int(
                footer.text.replace('Show all', '')
                .replace('companies', '')
                .replace(',', '')
            )
            ActionChains(driver).move_to_element(footer).click(footer).perform()
            break

    return company_count, interest_button
+
+
def argparse_setting():
    """Build the command-line parser for the crawler.

    Every option may be given without a value (``nargs='?'``). In that case
    its ``const`` is used, which now has the same type as the option itself;
    previously a bare ``-u``/``-e`` produced the integer ``1`` and a bare
    ``-p`` selected port 1.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    p = argparse.ArgumentParser()
    p.add_argument('-l', '--limit_count', nargs='?', const=1, type=int, default=20,
                   help='maximum number of users to process')
    p.add_argument('-u', '--user', nargs='?', const='person1', type=str, default='person1',
                   help='config.ini section holding the login credentials')
    p.add_argument('-p', '--port', nargs='?', const=4446, type=int, default=4446,
                   help='port of the Selenium server on 127.0.0.1')
    p.add_argument('-e', '--enviorment', nargs='?', const='windows', type=str, default='windows',
                   help='runtime environment name')
    return p
+
+
def main():
    """Log in and record the followed companies of users not yet processed.

    Up to ``--limit_count`` random rows of the ``user`` table whose
    ``company_list`` is NULL are visited. For each profile the followed
    companies are stored and the row is updated with the company names joined
    by ``' | '``. A profile whose page cannot be handled is reported and
    skipped, which leaves its ``company_list`` NULL. The browser session is
    always closed at the end.
    """
    args = argparse_setting().parse_args()

    # The default environment is 'windows'; the comparison used to test only
    # the misspelling 'winodws', so the local driver was never started by
    # default. The misspelling is still accepted for existing callers.
    if args.enviorment in ('windows', 'winodws'):
        driver = serive_create()
    else:
        driver = brower_start(args.port)

    try:
        driver.get('https://www.linkedin.com/login')

        linkedin_login(driver, config, user_choose=args.user)
        time.sleep(2)
        check_page(driver)

        # limit_count is an int (enforced by argparse), so interpolating it
        # into the statement is safe.
        result = db.query(f"SELECT * FROM user where company_list is null ORDER BY RAND() limit {args.limit_count}")
        # Materialise the rows first: the loop below writes to the same
        # database while iterating.
        rows = [dict(i) for i in result]

        for r in rows:
            url = r['url']
            driver.get(url)
            print(url)

            try:
                company_count, interest_button = move_to_interest_company_web(driver)
            except WebDriverException as err:
                # e.g. the profile has no interests section.
                print(f'skip {url}: {err}')
                continue
            print(f'company_count: {company_count}')

            if company_count == '':
                if interest_button is None:
                    print(f'skip {url}: no company tab')
                    continue
                company_list = get_company_from_first_page(interest_button)
            else:
                if company_count > 10:
                    # More results have to be loaded; cap the effort at 2000.
                    company_count = min(company_count, 2000)
                    show_more_result(driver, company_count)
                    time.sleep(1)
                company_list = get_company_from_next_page(driver)

            print(len(company_list))
            user_list_table.upsert({'url': url, 'company_list': ' | '.join(company_list[:2000])}, ['url'])
            time.sleep(2)
    finally:
        # Release the browser session even when the crawl fails midway.
        driver.quit()


if __name__ == '__main__':
    main()
+
+