person_interest.py

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from utility import *
import json
from datetime import datetime
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import pandas as pd
import dataset
import argparse
import configparser

config = configparser.ConfigParser()
config.read('config.ini')

db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/linkedin?charset=utf8mb4')
company_list_table = db['company']
user_list_table = db['user']
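
# Assumed shape of config.ini (not part of this snapshot): the parsed config
# is handed to utility.linkedin_login() together with --user, so a plausible
# sketch is one section per login identity, e.g.
#
#   [person1]
#   email = user@example.com
#   password = ********
#
# The actual keys are whatever utility.linkedin_login expects.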

def show_more_result(driver, company_count):
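    """Keep clicking LinkedIn's 'Show more results' button, once per batch
    of ~25 companies, until roughly company_count entries are loaded."""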
    for i in tqdm(range(int(company_count / 25) + 1)):
        for button in driver.find_elements(By.CSS_SELECTOR, 'button.scaffold-finite-scroll__load-button'):
            if button.text == 'Show more results':
                try:
                    ActionChains(driver).move_to_element(button).click(button).perform()
                except Exception:
                    # clicking can fail if the button is mid-scroll; just retry next pass
                    pass
                time.sleep(1)
                break

def get_company_from_first_page(interest_button, company_url_list):
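    """Scrape the companies shown directly on the Interests tab (no pagination),
    insert new ones into the company table, and return their names."""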
    company_list = []
    for i in interest_button.find_element(By.XPATH, "../../..").find_elements(
            By.CSS_SELECTOR, 'div.artdeco-tabpanel.active ul.pvs-list li.artdeco-list__item'):
        company_url = i.find_element(By.CSS_SELECTOR, 'a[data-field="active_tab_companies_interests"]').get_attribute('href')
        if company_url in company_url_list:
            continue  # already stored in the company table
        company_name = i.find_element(By.CSS_SELECTOR, 'span.t-bold span').text
        company_image = i.find_element(By.CSS_SELECTOR, 'img').get_attribute('src')
        company_followers = int(i.find_element(By.CSS_SELECTOR, 'span.t-black--light span').text
                                .replace(' followers', '').replace(',', ''))
        company_list += [company_name]
        company_list_table.insert({
            'company_url': company_url,
            'company_name': company_name,
            'company_image': company_image,
            'company_followers': company_followers,
            'dt': datetime.now()
        })
    return company_list

def get_company_from_next_page(driver, company_url_list):
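    """Parse the expanded company list out of the rendered page source with
    BeautifulSoup, insert new companies into the company table, and return
    the collected names."""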
    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
    company_list = []
    for item in tqdm(shop_soup.find_all('li', class_='pvs-list__paged-list-item')):
        try:
            company_url = item.select("a[data-field='active_tab_companies_interests']")[0]['href']
            if company_url in company_url_list:
                continue
            company_name = item.find('span', class_='t-bold').find('span').text
            company_image = item.select('div img')[0]['src']
            company_followers = item.find('span', class_='t-black--light').find('span').text
            company_followers = int(company_followers.replace(' followers', '').replace(',', ''))
            company_list += [company_name]
            company_list_table.insert({
                'company_url': company_url,
                'company_name': company_name,
                'company_image': company_image,
                'company_followers': company_followers,
                'dt': datetime.now()
            })
        except Exception:
            # items missing any of the expected fields are skipped
            pass
    return company_list

def move_to_interest_company_web(driver):
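    """On a profile page, open the Interests section's Companies tab and, if
    present, click 'Show all ... companies'. Returns the parsed company count
    ('' when there is no footer button) and the tab button element (None when
    the Companies tab is missing)."""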
    interest_div = driver.find_element(By.ID, 'interests')
    interest_div_parent = interest_div.find_element(By.XPATH, "..")

    # switch to the Companies tab
    interest_button = None
    for button in interest_div_parent.find_elements(By.CSS_SELECTOR, 'button span'):
        if button.text == 'Companies':
            interest_button = button
            ActionChains(driver).move_to_element(button).click(button).perform()
            break

    # open the full company list, reading the total from the footer button
    company_count = ''
    show_all_company_button = interest_div_parent.find_elements(By.CSS_SELECTOR, 'div.pvs-list__footer-wrapper')
    for button in show_all_company_button:
        if button.text.find('companies') != -1:
            company_count = int(button.text.replace('Show all', '')
                                           .replace('companies', '')
                                           .replace(',', ''))
            ActionChains(driver).move_to_element(button).click(button).perform()
            break
    return company_count, interest_button

def argparse_setting():
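    """Build the CLI parser: crawl limit, config user, remote driver port,
    and target environment (windows or linux)."""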
    p = argparse.ArgumentParser()
    p.add_argument('-l', '--limit_count', nargs='?', const=1, type=int, default=20)
    p.add_argument('-u', '--user', nargs='?', const=1, type=str, default='person1')
    p.add_argument('-p', '--port', nargs='?', const=1, type=int, default=4446)
    p.add_argument('-e', '--environment', nargs='?', const=1, type=str, default='windows', help="windows or linux")
    return p

def main():
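    """Log in to LinkedIn, pick up to limit_count users whose company_list
    is still NULL, and crawl the companies each user follows."""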
    p = argparse_setting()
    args = p.parse_args()

    if args.environment == 'windows':
        print('windows web start')
        driver = serive_create()
    else:
        print('linux web start')
        driver = brower_start(args.port)

    url = 'https://www.linkedin.com/login'
    driver.get(url)
    print(f'logging in as {args.user}')
    linkedin_login(driver, config, user_choose=args.user)
    time.sleep(2)
    check_page(driver)

    # pick users whose company list has not been crawled yet
    result = db.query(f"SELECT * FROM user WHERE company_list IS NULL ORDER BY RAND() LIMIT {args.limit_count}")
    result = pd.DataFrame([dict(i) for i in result])

    print('start crawling...')
    for k, r in result.iterrows():
        company_url_list = check_duplicate('company', 'company_url', db)
        url = r['url']
        driver.get(url)
        print(url)

        company_count, interest_button = move_to_interest_company_web(driver)
        print(f'company_count: {company_count}')

        if company_count == '':
            # no 'Show all' footer: every company is already on the first page
            company_list = get_company_from_first_page(interest_button, company_url_list)
        else:
            if company_count > 10:
                # cap at 2000 and page through the 'Show more results' button
                if company_count > 2000:
                    company_count = 2000
                show_more_result(driver, company_count)
                time.sleep(1)
            company_list = get_company_from_next_page(driver, company_url_list)

        print(len(company_list))
        user_list_table.upsert({'url': url, 'company_list': ' | '.join(company_list[:2000])}, ['url'])
        time.sleep(2)

if __name__ == '__main__':
    main()
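
# Example run (argument defaults shown in argparse_setting above):
#   python person_interest.py -u person1 -l 20 -p 4446 -e windows
# On linux the -p port is passed to brower_start(), presumably a remote
# webdriver; on windows serive_create() builds a local driver (both names
# are defined in utility.py, which this script star-imports).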