person_interest.py

# -*- coding: utf-8 -*-
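"""Collect the companies that LinkedIn members follow ("Interests" > "Companies").

The script logs in with credentials from config.ini, picks rows from the `user`
table whose `company_list` is still NULL, opens each profile URL, scrapes the
followed companies into the `company` table, and writes the joined company names
back to `user.company_list`.

Example invocation (flag values are illustrative; see argparse_setting() for the
defaults):

    python person_interest.py -l 20 -u person1 -p 4446 -e windows
"""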
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException

import json
import time
import argparse
import configparser
from datetime import datetime

from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import dataset

config = configparser.ConfigParser()
config.read('config.ini')
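
# Expected config.ini layout (sketch; section names must match the --user flag,
# values below are placeholders):
#
#   [person1]
#   user = someone@example.com
#   passwd = ********
#
#   [person2]
#   user = someone.else@example.com
#   passwd = ********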

db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/linkedin?charset=utf8mb4')
company_list_table = db['company']
user_list_table = db['user']

def brower_start(port):
    # Attach to an already-running Selenium server / chromedriver on the given port.
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser

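# brower_start() assumes a Selenium standalone server is already listening at
# http://127.0.0.1:<port>/wd/hub, e.g. started with an illustrative command such as:
#   java -jar selenium-server-standalone.jar -port 4446
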
def serive_create():
    # Start a local Chrome instance from a bundled chromedriver (used on Windows).
    option = webdriver.ChromeOptions()
    option.add_argument('--disable-web-security')
    option.add_argument('--allow-running-insecure-content')
    # option.add_argument("--user-data-dir=//Users//noodles//Documents//project")
    # option.add_argument("profile-directory="+profilepath)
    driver = webdriver.Chrome('../../driver/chromedriver_20230202/chromedriver', options=option)
    executor_url = driver.command_executor._url
    session_id = driver.session_id
    print(session_id)
    print(executor_url)
    time.sleep(3)
    return driver

def string_check(x):
    return x.strip()

def get_content_info(driver):
    # Parse a public LinkedIn feed post: author, headline, body text and linked article URL.
    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
    post_info = shop_soup.find('article').select("div[data-test-id='main-feed-activity-card__entity-lockup']")[0]
    post_name = post_info.find('a', class_='text-sm link-styled no-underline leading-open').text
    post_name = string_check(post_name)
    post_position = post_info.find('p').text
    post_position = string_check(post_position)
    print(post_name, ';', post_position)
    content = shop_soup.find('article').find('p', class_='attributed-text-segment-list__content').text
    print(content)
    try:
        content_url = shop_soup.select("article a[data-tracking-control-name='public_post_feed-article-content']")[0].get('href')
    except Exception:
        content_url = ''
    return {
        'post_name': post_name,
        'post_position': post_position,
        'content': content,
        'content_url': content_url
    }

def linkedin_login(driver, config, user_choose='person2'):
    # Fill the LinkedIn login form with credentials from the chosen config.ini section.
    user = config[user_choose]['user']
    passwd = config[user_choose]['passwd']
    user_button = driver.find_element(By.ID, "username")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(user_button).click(user_button).send_keys(user).perform()
    # time.sleep(3)
    passwd_button = driver.find_element(By.ID, "password")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(passwd_button).click(passwd_button).send_keys(passwd).send_keys(Keys.ENTER).perform()
    # time.sleep(1)

def check_duplicate(table_name, column):
    # Return every existing value of `column` so already-scraped rows can be skipped.
    result = db.query(f'SELECT {column} FROM {table_name}')
    result = pd.DataFrame([dict(i) for i in result])
    return result[column].to_list()

def check_page(driver):
    # Dismiss LinkedIn's "我們無法聯絡到您" ("we were unable to reach you") e-mail
    # verification interstitial if it appears after login.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    try:
        if soup.find('h2', class_='headline-new').text.find('我們無法聯絡到您') != -1:
            print('email error')
            ignore_button = driver.find_element(By.CSS_SELECTOR, "button.secondary-action-new")
            driver.implicitly_wait(30)
            ActionChains(driver).move_to_element(ignore_button).click(ignore_button).perform()
    except Exception:
        pass

def show_more_result(driver, company_count):
    # Click "Show more results" repeatedly to load every company (roughly 25 per page).
    for _ in tqdm(range(company_count // 25 + 1)):
        for button in driver.find_elements(By.CSS_SELECTOR, 'button.scaffold-finite-scroll__load-button'):
            if button.text == 'Show more results':
                # print(button)
                try:
                    ActionChains(driver).move_to_element(button).click(button).perform()
                except Exception:
                    pass
                time.sleep(1)
                break

def get_company_from_first_page(interest_button):
    # Companies listed directly on the profile's "Interests" tab (no "Show all" page).
    company_list = []
    for i in interest_button.find_element(By.XPATH, "../../..").find_elements(
            By.CSS_SELECTOR, 'div.artdeco-tabpanel.active ul.pvs-list li.artdeco-list__item'):
        company_name = i.find_element(By.CSS_SELECTOR, 'span.t-bold span').text
        company_url = i.find_element(By.CSS_SELECTOR, 'a[data-field="active_tab_companies_interests"]').get_attribute('href')
        company_image = i.find_element(By.CSS_SELECTOR, 'img').get_attribute('src')
        company_followers = int(i.find_element(By.CSS_SELECTOR, 'span.t-black--light span').text.replace(' followers', '').replace(',', ''))
        company_list += [company_name]
        company_list_table.insert({
            'company_url': company_url,
            'company_name': company_name,
            'company_image': company_image,
            'company_followers': company_followers,
            'dt': datetime.now()
        })
    return company_list

def get_company_from_next_page(driver):
    # Parse the dedicated "companies" interest page after all results have been loaded.
    shop_soup = BeautifulSoup(driver.page_source, 'html.parser')
    company_list = []
    for item in tqdm(shop_soup.find_all('li', class_='pvs-list__paged-list-item')):
        try:
            company_url = item.select("a[data-field='active_tab_companies_interests']")[0]['href']
            company_name = item.find('span', class_='t-bold').find('span').text
            company_image = item.select('div img')[0]['src']
            company_followers = item.find('span', class_='t-black--light').find('span').text
            company_followers = int(company_followers.replace(' followers', '').replace(',', ''))
            company_list += [company_name]
            company_list_table.insert({
                'company_url': company_url,
                'company_name': company_name,
                'company_image': company_image,
                'company_followers': company_followers,
                'dt': datetime.now()
            })
        except Exception:
            pass
    return company_list

def move_to_interest_company_web(driver):
    interest_div = driver.find_element(By.ID, 'interests')
    interest_div_parent = interest_div.find_element(By.XPATH, "..")
    # switch to the "Companies" tab inside the Interests section
    interest_button = None
    for button in interest_div_parent.find_elements(By.CSS_SELECTOR, 'button span'):
        if button.text == 'Companies':
            interest_button = button
            ActionChains(driver).move_to_element(button).click(button).perform()
            break
    # click "Show all ... companies" if present, reading the total count from its label
    company_count = ''
    # interest_div = driver.find_element(By.ID,'interests')
    # interest_div_parent = interest_div.find_element(By.XPATH,"..")
    show_all_company_button = interest_div_parent.find_elements(By.CSS_SELECTOR, 'div.pvs-list__footer-wrapper')
    for button in show_all_company_button:
        if button.text.find('companies') != -1:
            company_count = int(button.text.replace('Show all', '')
                                           .replace('companies', '')
                                           .replace(',', ''))
            ActionChains(driver).move_to_element(button).click(button).perform()
            break
    return company_count, interest_button

def argparse_setting():
    p = argparse.ArgumentParser()
    p.add_argument('-l', '--limit_count', nargs='?', const=1, type=int, default=20)
    p.add_argument('-u', '--user', nargs='?', const=1, type=str, default='person1')
    p.add_argument('-p', '--port', nargs='?', const=1, type=int, default=4446)
    p.add_argument('-e', '--enviorment', nargs='?', const=1, type=str, default='windows')
    # p.add_argument('--add-feature-a', dest='a', action='store_true', default=False)
    return p

def main():
    p = argparse_setting()
    args = p.parse_args()
    # Local chromedriver on Windows, otherwise attach to the remote webdriver.
    if args.enviorment == 'windows':
        driver = serive_create()
    else:
        driver = brower_start(args.port)
    url = 'https://www.linkedin.com/login'
    driver.get(url)
    linkedin_login(driver, config, user_choose=args.user)
    time.sleep(2)
    check_page(driver)
    # Pick users whose followed companies have not been collected yet.
    result = db.query(f"SELECT * FROM user WHERE company_list IS NULL ORDER BY RAND() LIMIT {args.limit_count}")
    result = pd.DataFrame([dict(i) for i in result])
    # try:
    for k, r in result.iterrows():
        company_url_list = check_duplicate('company', 'company_url')
        url = r['url']
        driver.get(url)
        print(url)
        company_count, interest_button = move_to_interest_company_web(driver)
        print(f'company_count: {company_count}')
        if company_count == '':
            # no "Show all companies" link: scrape the companies shown on the profile itself
            company_list = get_company_from_first_page(interest_button)
        else:
            if company_count > 10:
                # load more results before parsing (capped at 2000 companies)
                if company_count > 2000:
                    company_count = 2000
                show_more_result(driver, company_count)
                time.sleep(1)
            company_list = get_company_from_next_page(driver)
        print(len(company_list))
        user_list_table.upsert({'url': url, 'company_list': ' | '.join(company_list[:2000])}, ['url'])
        time.sleep(2)
    # except:
    #     pass

if __name__ == '__main__':
    main()