person_interest.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. # -*- coding: utf-8 -*-
  2. from selenium import webdriver
  3. from selenium.webdriver.common.action_chains import ActionChains
  4. from selenium.webdriver.common.keys import Keys
  5. from selenium.webdriver.support import expected_conditions as EC
  6. from selenium.webdriver.support.wait import WebDriverWait
  7. from selenium.webdriver.common.by import By
  8. from selenium.common.exceptions import TimeoutException
  9. from selenium.common.exceptions import WebDriverException
  10. from utility import *
  11. from const import *
  12. import json
  13. from datetime import datetime
  14. from bs4 import BeautifulSoup
  15. import time, os
  16. from tqdm import tqdm
  17. import pandas as pd
  18. import dataset
  19. import argparse
  20. import configparser
# Runtime configuration (LinkedIn login credentials etc.) comes from config.ini.
config = configparser.ConfigParser()
config.read('config.ini')
# NOTE(review): database credentials are hard-coded in the URL — consider moving
# them into config.ini or environment variables.
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/linkedin?charset=utf8mb4')
# Destination table for crawled companies and the table of users to crawl;
# DB_COMPANY / DB_USER are table names imported from const.
company_list_table = db[DB_COMPANY]
user_list_table = db[DB_USER]
  26. def show_more_result(driver, company_count):
  27. for i in tqdm(range(int(company_count/25)+1)):
  28. for button in driver.find_elements(By.CSS_SELECTOR,'button.scaffold-finite-scroll__load-button'):
  29. if button.text == 'Show more results':
  30. # print(button)
  31. try:
  32. ActionChains(driver).move_to_element(button).click(button).perform()
  33. except:
  34. pass
  35. time.sleep(1)
  36. break
  37. def get_company_from_first_page(interest_button, company_url_list):
  38. company_list = []
  39. for i in interest_button.find_element(By.XPATH,"../../..").find_elements(By.CSS_SELECTOR,' div.artdeco-tabpanel.active ul.pvs-list li.artdeco-list__item'):
  40. company_url = i.find_element(By.CSS_SELECTOR,'a[data-field="active_tab_companies_interests"]').get_attribute('href')
  41. if company_url in company_url_list:
  42. continue
  43. company_name = i.find_element(By.CSS_SELECTOR,'span.t-bold span').text
  44. company_image = i.find_element(By.CSS_SELECTOR,'img').get_attribute('src')
  45. company_followers = int(i.find_element(By.CSS_SELECTOR,'span.t-black--light span').text.replace(' followers','').replace(',',''))
  46. company_list += [company_name]
  47. company_list_table.insert({
  48. 'company_url': company_url,
  49. 'company_name': company_name,
  50. 'company_image': company_image,
  51. 'company_followers': company_followers,
  52. 'dt': datetime.now()
  53. })
  54. return company_list
  55. def get_company_from_next_page(driver, company_url_list):
  56. shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
  57. company_list = []
  58. for item in tqdm(shop_soup.find_all('li', class_='pvs-list__paged-list-item')):
  59. try:
  60. company_url = item.select("a[data-field='active_tab_companies_interests']")[0]['href']
  61. if company_url in company_url_list:
  62. continue
  63. company_name = item.find('span', class_= 't-bold').find('span').text
  64. company_image = item.select('div img')[0]['src']
  65. company_followers = item.find('span', 't-black--light').find('span').text
  66. company_followers = int(company_followers.replace(' followers','').replace(',',''))
  67. company_list += [company_name]
  68. company_list_table.insert({
  69. 'company_url': company_url,
  70. 'company_name': company_name,
  71. 'company_image': company_image,
  72. 'company_followers': company_followers,
  73. 'dt': datetime.now()
  74. })
  75. except:
  76. pass
  77. return company_list
  78. def move_to_interest_company_web(driver):
  79. interest_div = driver.find_element(By.ID,'interests')
  80. interest_div_parent = interest_div.find_element(By.XPATH,"..")
  81. # move to company tag
  82. for button in interest_div_parent.find_elements(By.CSS_SELECTOR,'button span'):
  83. if button.text == 'Companies':
  84. interest_button = button
  85. ActionChains(driver).move_to_element(button).click(button).perform()
  86. break
  87. # show all company
  88. company_count = ''
  89. # interest_div = driver.find_element(By.ID,'interests')
  90. # interest_div_parent = interest_div.find_element(By.XPATH,"..")
  91. show_all_company_button = interest_div_parent.find_elements(By.CSS_SELECTOR, 'div.pvs-list__footer-wrapper')
  92. for button in show_all_company_button:
  93. if button.text.find('companies') != -1:
  94. company_count = int(button.text.replace('Show all','')\
  95. .replace('companies','')\
  96. .replace(',', ''))
  97. ActionChains(driver).move_to_element(button).click(button).perform()
  98. break
  99. return company_count, interest_button
  100. def argparse_setting():
  101. p = argparse.ArgumentParser()
  102. p.add_argument('-l', '--limit_count', nargs='?', const=1, type=int, default=20)
  103. p.add_argument('-u', '--user', nargs='?', const=1, type=str, default='person1')
  104. p.add_argument('-p', '--port', nargs='?', const=1, type=int, default='4446')
  105. p.add_argument('-e', '--enviorment', nargs='?', const=1, type=str, default='windows', help="windows or linux")
  106. return p
def main():
    """Crawl the 'Companies' interest list for a batch of stored LinkedIn users.

    Picks users whose company_list is still NULL (random order), logs into
    LinkedIn with selenium, expands each user's Companies interests, stores
    new companies via the scraping helpers, and writes the joined company
    names back onto the user row.
    """
    p = argparse_setting()
    args = p.parse_args()
    if args.enviorment == 'windows':
        print('windows web start')
        # serive_create / brower_start / linkedin_login / check_page /
        # check_duplicate come from utility (star import above).
        driver = serive_create()
    else:
        print('linux web start')
        print('restart docker p{}'.format(args.port))
        # Restart the remote-browser container, then give it time to come up.
        os.system('sudo docker container restart p' + str(args.port))
        time.sleep(8)
        driver = brower_start(args.port, 8787, False)
    url = 'https://www.linkedin.com/login'
    driver.get(url)
    print(f'login in with {args.user}')
    linkedin_login(driver, config, user_choose=args.user)
    time.sleep(2)
    check_page(driver)
    # Random batch of not-yet-crawled users, capped by --limit_count.
    result = db.query(f"SELECT * FROM {DB_USER} where company_list is null ORDER BY RAND() limit {args.limit_count}")
    result = pd.DataFrame([dict(i) for i in result])
    # try:
    print('start to crawler...')
    for k, r in result.iterrows():
        # Refresh known company URLs each iteration so duplicates are skipped.
        company_url_list = check_duplicate(DB_COMPANY, 'company_url', db)
        url = r['url']
        driver.get(url)
        print(url)
        company_count, interest_button = move_to_interest_company_web(driver)
        print(f'company_count: {company_count}')
        if company_count == '':
            # No 'Show all companies' footer: everything is on the first page.
            company_list = get_company_from_first_page(interest_button, company_url_list)
        else:
            if company_count > 10:
                # more results
                if company_count > 2000:
                    # Hard cap on how far we scroll to keep runtime bounded.
                    company_count = 2000
                show_more_result(driver, company_count)
                time.sleep(1)
            company_list = get_company_from_next_page(driver, company_url_list)
        print(len(company_list))
        # Store at most 2000 names, pipe-separated, keyed by profile URL.
        user_list_table.upsert({'url': url, 'company_list': ' | '.join(company_list[:2000])}, ['url'])
        time.sleep(2)
    # except:
    #     pass
# Entry point: run the crawler only when executed as a script.
if __name__ == '__main__':
    main()