noodles 1 년 전
부모
커밋
3c663a8203
3개의 변경된 파일, 17개의 추가 그리고 9개의 삭제
  1. 3 0
      const.py
  2. 5 4
      get_person_list.py
  3. 9 5
      person_interest.py

+ 3 - 0
const.py

@@ -0,0 +1,3 @@
+DB_USER = 'user2'
+DB_COMPANY = 'company2'
+DB_POST = 'post2'

+ 5 - 4
get_person_list.py

@@ -9,6 +9,7 @@ from selenium.common.exceptions import TimeoutException
 from selenium.common.exceptions import WebDriverException
 
 from utility import *
+from const import *
 import json
 from datetime import datetime
 from bs4 import BeautifulSoup
@@ -22,10 +23,10 @@ import configparser
 config = configparser.ConfigParser()
 config.read('config.ini')
 db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/linkedin?charset=utf8mb4')
-post_list_table = db['post2']
-user_list_table = db['user']
-user_url_list = check_duplicate('user', 'url', db)
-url_list = check_duplicate('post2', 'url', db)
+post_list_table = db[DB_POST]
+user_list_table = db[DB_USER]
+user_url_list = check_duplicate(DB_USER, 'url', db)
+url_list = check_duplicate(DB_POST, 'url', db)
 
 
 def get_content_info(driver):

+ 9 - 5
person_interest.py

@@ -9,10 +9,11 @@ from selenium.common.exceptions import TimeoutException
 from selenium.common.exceptions import WebDriverException
 
 from utility import *
+from const import *
 import json
 from datetime import datetime
 from bs4 import BeautifulSoup
-import time
+import time, os
 
 from tqdm import tqdm
 import pandas as pd
@@ -22,8 +23,8 @@ import configparser
 config = configparser.ConfigParser()
 config.read('config.ini')
 db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/linkedin?charset=utf8mb4')
-company_list_table = db['company']
-user_list_table = db['user']
+company_list_table = db[DB_COMPANY]
+user_list_table = db[DB_USER]
 
 
 def show_more_result(driver, company_count):
@@ -134,6 +135,9 @@ def main():
         driver = serive_create()
     else:
         print('linux web start')
+        print('restart docker p{}'.format(args.port))
+        os.system('sudo docker container restart p'+str(args.port))
+        time.sleep(8)
         driver = brower_start(args.port)
     url = 'https://www.linkedin.com/login'
     driver.get(url)
@@ -143,13 +147,13 @@ def main():
     time.sleep(2)
     check_page(driver)
 
-    result = db.query(f"SELECT * FROM user where company_list is null ORDER BY RAND() limit {args.limit_count}")
+    result = db.query(f"SELECT * FROM {DB_USER} where company_list is null ORDER BY RAND() limit {args.limit_count}")
     result = pd.DataFrame([dict(i) for i in result])
 
     # try:
     print('start to crawler...')
     for k, r in result.iterrows():
-        company_url_list = check_duplicate('company', 'company_url', db)
+        company_url_list = check_duplicate(DB_COMPANY, 'company_url', db)
         
         url = r['url']
         driver.get(url)