|
@@ -9,10 +9,11 @@ from selenium.common.exceptions import TimeoutException
|
|
|
from selenium.common.exceptions import WebDriverException
|
|
|
|
|
|
from utility import *
|
|
|
+from const import *
|
|
|
import json
|
|
|
from datetime import datetime
|
|
|
from bs4 import BeautifulSoup
|
|
|
-import time
|
|
|
+import time, os
|
|
|
|
|
|
from tqdm import tqdm
|
|
|
import pandas as pd
|
|
@@ -22,8 +23,8 @@ import configparser
|
|
|
config = configparser.ConfigParser()
|
|
|
config.read('config.ini')
|
|
|
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/linkedin?charset=utf8mb4')
|
|
|
-company_list_table = db['company']
|
|
|
-user_list_table = db['user']
|
|
|
+company_list_table = db[DB_COMPANY]
|
|
|
+user_list_table = db[DB_USER]
|
|
|
|
|
|
|
|
|
def show_more_result(driver, company_count):
|
|
@@ -134,6 +135,9 @@ def main():
|
|
|
driver = serive_create()
|
|
|
else:
|
|
|
print('linux web start')
|
|
|
+ print('restart docker p{}'.format(args.port))
|
|
|
+ os.system('sudo docker container restart p'+str(args.port))
|
|
|
+ time.sleep(8)
|
|
|
driver = brower_start(args.port)
|
|
|
url = 'https://www.linkedin.com/login'
|
|
|
driver.get(url)
|
|
@@ -143,13 +147,13 @@ def main():
|
|
|
time.sleep(2)
|
|
|
check_page(driver)
|
|
|
|
|
|
- result = db.query(f"SELECT * FROM user where company_list is null ORDER BY RAND() limit {args.limit_count}")
|
|
|
+ result = db.query(f"SELECT * FROM {DB_USER} where company_list is null ORDER BY RAND() limit {args.limit_count}")
|
|
|
result = pd.DataFrame([dict(i) for i in result])
|
|
|
|
|
|
# try:
|
|
|
print('start to crawler...')
|
|
|
for k, r in result.iterrows():
|
|
|
- company_url_list = check_duplicate('company', 'company_url', db)
|
|
|
+ company_url_list = check_duplicate(DB_COMPANY, 'company_url', db)
|
|
|
|
|
|
url = r['url']
|
|
|
driver.get(url)
|