# get_person_list.py (5.8 KB)
# -*- coding: utf-8 -*-
"""Crawl a single LinkedIn post: store its content and reaction count, then
record every user who reacted to it (tables ``post2`` and ``user``)."""
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
# project helpers — presumably provides check_duplicate, string_check,
# serive_create, brower_start, linkedin_login, check_page (used below)
from utility import *
import json
from datetime import datetime
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import pandas as pd
import dataset
import argparse
import configparser

# Login credentials are read from config.ini (consumed by linkedin_login).
config = configparser.ConfigParser()
config.read('config.ini')

# SECURITY NOTE(review): database credentials are hard-coded in source —
# they should be moved into config.ini or the environment.
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/linkedin?charset=utf8mb4')
post_list_table = db['post2']   # one row per crawled post
user_list_table = db['user']    # one row per reacting user
# Pre-load already-stored URLs so reruns skip duplicate inserts.
user_url_list = check_duplicate('user', 'url', db)
url_list = check_duplicate('post2', 'url', db)
  27. def get_content_info(driver):
  28. shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
  29. post_info = shop_soup.select("a.app-aware-link div.update-components-actor__meta ")[0]
  30. post_name = post_info.find('span', class_='t-bold').text
  31. post_name = string_check(post_name)
  32. post_position = post_info.find('span', class_='t-black--light').text
  33. post_position = string_check(post_position)
  34. print(post_name, ';', post_position)
  35. content = shop_soup.find('div',class_='feed-shared-update-v2__description-wrapper').select("span[dir='ltr']")[0].text
  36. print(content)
  37. try:
  38. content_url = shop_soup.select('div.update-components-article__link-container')[0].find('a').get('href')
  39. except:
  40. content_url = ''
  41. return {
  42. 'post_name': post_name,
  43. 'post_position':post_position,
  44. 'content':content,
  45. 'content_url':content_url
  46. }
  47. def get_reaction_button(driver):
  48. more_reaction_button = driver.find_element(By.CSS_SELECTOR, "button[data-jump-link-target='reactors-facepile-see-more-jump-target']")
  49. # print(more_reaction_button)
  50. driver.implicitly_wait(30)
  51. ActionChains(driver).move_to_element(more_reaction_button).click(more_reaction_button).perform()
  52. time.sleep(1)
  53. try:
  54. all_reactions_num = driver.find_element(By.CSS_SELECTOR, "button[data-js-reaction-tab='ALL']").text
  55. all_reactions_num = all_reactions_num.split('\n')[-1]
  56. print(f'reactions numbers: {all_reactions_num}')
  57. except:
  58. all_reactions_num = driver.find_element(By.CSS_SELECTOR, "button[data-js-reaction-tab='LIKE']").text
  59. all_reactions_num = all_reactions_num.split('\n')[-1]
  60. print(f'reactions numbers: {all_reactions_num}')
  61. all_reactions_num = int(all_reactions_num.replace(',', ''))
  62. return all_reactions_num
  63. def show_more_result(driver, all_reactions_num):
  64. if all_reactions_num > 2000:
  65. all_reactions_num = 2000
  66. for i in tqdm(range(int(int(all_reactions_num) / 10 ) + 1)):
  67. for button in driver.find_elements(By.CSS_SELECTOR,'button'):
  68. if button.text == 'Show more results':
  69. try:
  70. button_click = button
  71. ActionChains(driver).move_to_element(button_click).click(button_click).perform()
  72. except:
  73. pass
  74. break
  75. time.sleep(1)
  76. def argparse_setting():
  77. p = argparse.ArgumentParser()
  78. p.add_argument('-u', '--user', nargs='?', const=1, type=str, default='person1')
  79. p.add_argument('-p', '--port', nargs='?', const=1, type=int, default='4446')
  80. p.add_argument('-e', '--enviorment', nargs='?', const=1, type=str, default='windows', help="windows or linux")
  81. p.add_argument('-url', '--url', required = True)
  82. # p.add_argument('--add-feature-a', dest='a', action='store_true', default=False)
  83. return p
  84. def main():
  85. p = argparse_setting()
  86. args = p.parse_args()
  87. print(args.enviorment, args.port, args.user)
  88. if args.enviorment == 'windows':
  89. print('windows web start')
  90. driver = serive_create()
  91. else:
  92. print('linux web start')
  93. driver = brower_start(args.port)
  94. print(f'login in with {args.user}')
  95. url = 'https://www.linkedin.com/login'
  96. driver.get(url)
  97. linkedin_login(driver, config, user_choose=args.user)
  98. time.sleep(2)
  99. check_page(driver)
  100. time.sleep(2)
  101. url = args.url
  102. print(url)
  103. # post info
  104. print('start to crawler...')
  105. for i in range(3):
  106. try:
  107. driver.get(url)
  108. time.sleep(2)
  109. post_output = get_content_info(driver)
  110. post_output['url'] = url
  111. post_output['dt'] = datetime.now()
  112. if url not in url_list:
  113. print('url not in post table')
  114. post_list_table.insert(post_output)
  115. all_reactions_num = get_reaction_button(driver)
  116. post_list_table.upsert({'url':url,'all_reactions_num':all_reactions_num},['url'])
  117. print('upsert success')
  118. break
  119. except:
  120. pass
  121. show_more_result(driver, all_reactions_num)
  122. a_list = driver.find_elements(By.CSS_SELECTOR, "ul.artdeco-list li a[rel='noopener noreferrer']")
  123. print(len(a_list))
  124. # reaction_list = []
  125. print('start to insert...')
  126. for i in a_list:
  127. person_url = i.get_attribute("href")
  128. reaction_info = i.text.split('\n')
  129. person_name = reaction_info[0]
  130. person_position = reaction_info[-1]
  131. if person_url not in user_url_list:
  132. user_list_table.insert({
  133. 'post_url': url,
  134. 'url': person_url,
  135. 'person_name': person_name,
  136. 'person_position': person_position,
  137. 'dt': datetime.now()
  138. })
  139. # reaction_list += [[url, person_url, person_name, person_position]]
# Script entry point — run the crawler only when executed directly.
if __name__ == '__main__':
    main()