# get_person_list.py
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from utility import *
from const import *
import json
from datetime import datetime
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import pandas as pd
import dataset
import argparse
import configparser

# Module-level setup: read local config and open the crawler database.
config = configparser.ConfigParser()
config.read('config.ini')
# SECURITY NOTE(review): database credentials are hardcoded in the URL below.
# They should be moved into config.ini (or an environment variable) instead
# of living in source control.
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/linkedin?charset=utf8mb4')
# DB_POST / DB_USER are table-name constants from const.py.
post_list_table = db[DB_POST]
user_list_table = db[DB_USER]
# Pre-load already-seen URLs so the crawl loop can skip duplicates.
# check_duplicate is a project helper (utility.py); presumably it returns the
# distinct values of the given column — TODO confirm against utility.py.
user_url_list = check_duplicate(DB_USER, 'url', db)
url_list = check_duplicate(DB_POST, 'url', db)
  28. def get_content_info(driver):
  29. shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
  30. post_info = shop_soup.select("a.app-aware-link div.update-components-actor__meta ")[0]
  31. post_name = post_info.find('span', class_='t-bold').text
  32. post_name = string_check(post_name)
  33. post_position = post_info.find('span', class_='t-black--light').text
  34. post_position = string_check(post_position)
  35. print(post_name, ';', post_position)
  36. content = shop_soup.find('div',class_='feed-shared-update-v2__description-wrapper').select("span[dir='ltr']")[0].text
  37. print(content)
  38. try:
  39. content_url = shop_soup.select('div.update-components-article__link-container')[0].find('a').get('href')
  40. except:
  41. content_url = ''
  42. return {
  43. 'post_name': post_name,
  44. 'post_position':post_position,
  45. 'content':content,
  46. 'content_url':content_url
  47. }
  48. def get_reaction_button(driver):
  49. more_reaction_button = driver.find_element(By.CSS_SELECTOR, "button[data-jump-link-target='reactors-facepile-see-more-jump-target']")
  50. # print(more_reaction_button)
  51. driver.implicitly_wait(30)
  52. ActionChains(driver).move_to_element(more_reaction_button).click(more_reaction_button).perform()
  53. time.sleep(1)
  54. try:
  55. all_reactions_num = driver.find_element(By.CSS_SELECTOR, "button[data-js-reaction-tab='ALL']").text
  56. all_reactions_num = all_reactions_num.split('\n')[-1]
  57. print(f'reactions numbers: {all_reactions_num}')
  58. except:
  59. all_reactions_num = driver.find_element(By.CSS_SELECTOR, "button[data-js-reaction-tab='LIKE']").text
  60. all_reactions_num = all_reactions_num.split('\n')[-1]
  61. print(f'reactions numbers: {all_reactions_num}')
  62. all_reactions_num = int(all_reactions_num.replace(',', ''))
  63. return all_reactions_num
  64. def show_more_result(driver, all_reactions_num):
  65. if all_reactions_num > 2000:
  66. all_reactions_num = 2000
  67. for i in tqdm(range(int(int(all_reactions_num) / 10 ) + 1)):
  68. for button in driver.find_elements(By.CSS_SELECTOR,'button'):
  69. if button.text == 'Show more results':
  70. try:
  71. button_click = button
  72. ActionChains(driver).move_to_element(button_click).click(button_click).perform()
  73. except:
  74. pass
  75. break
  76. time.sleep(1)
  77. def argparse_setting():
  78. p = argparse.ArgumentParser()
  79. p.add_argument('-u', '--user', nargs='?', const=1, type=str, default='person1')
  80. p.add_argument('-p', '--port', nargs='?', const=1, type=int, default='4446')
  81. p.add_argument('-e', '--enviorment', nargs='?', const=1, type=str, default='windows', help="windows or linux")
  82. p.add_argument('-url', '--url', required = True)
  83. # p.add_argument('--add-feature-a', dest='a', action='store_true', default=False)
  84. return p
  85. def main():
  86. p = argparse_setting()
  87. args = p.parse_args()
  88. print(args.enviorment, args.port, args.user)
  89. if args.enviorment == 'windows':
  90. print('windows web start')
  91. driver = serive_create()
  92. else:
  93. print('linux web start')
  94. driver = brower_start(args.port)
  95. print(f'login in with {args.user}')
  96. url = 'https://www.linkedin.com/login'
  97. driver.get(url)
  98. linkedin_login(driver, config, user_choose=args.user)
  99. time.sleep(2)
  100. check_page(driver)
  101. time.sleep(2)
  102. url = args.url
  103. print(url)
  104. # post info
  105. print('start to crawler...')
  106. for i in range(3):
  107. try:
  108. driver.get(url)
  109. time.sleep(2)
  110. post_output = get_content_info(driver)
  111. post_output['url'] = url
  112. post_output['dt'] = datetime.now()
  113. if url not in url_list:
  114. print('url not in post table')
  115. post_list_table.insert(post_output)
  116. all_reactions_num = get_reaction_button(driver)
  117. post_list_table.upsert({'url':url,'all_reactions_num':all_reactions_num},['url'])
  118. print('upsert success')
  119. break
  120. except:
  121. pass
  122. show_more_result(driver, all_reactions_num)
  123. a_list = driver.find_elements(By.CSS_SELECTOR, "ul.artdeco-list li a[rel='noopener noreferrer']")
  124. print(len(a_list))
  125. # reaction_list = []
  126. print('start to insert...')
  127. for i in a_list:
  128. person_url = i.get_attribute("href")
  129. reaction_info = i.text.split('\n')
  130. person_name = reaction_info[0]
  131. person_position = reaction_info[-1]
  132. if person_url not in user_url_list:
  133. user_list_table.insert({
  134. 'post_url': url,
  135. 'url': person_url,
  136. 'person_name': person_name,
  137. 'person_position': person_position,
  138. 'dt': datetime.now()
  139. })
  140. # reaction_list += [[url, person_url, person_name, person_position]]
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()