@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+import time, sys, os
+import dataset
+import pandas as pd
+from datetime import datetime
+
+
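+# Connect to the remote Chrome session exposed by the Selenium container on this port.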
+def browser_start(port):
+    options = webdriver.ChromeOptions()
+    browser = webdriver.Remote(
+        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
+        options=options
+    )
+    return browser
+
+
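+# Parse the link text '工作機會(N)' ("N job openings") into the integer N.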
+def job_count_parse(x):
+    x = x.replace('工作機會(', '').replace(')', '')
+    return int(x)
+
+
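+# Return every company URL already stored, so crawled pages can be skipped.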
+def build_cache(db, table):
+    urls = []
+    cursor = db.query('SELECT url FROM 104_company.{};'.format(table))
+    for c in cursor:
+        urls.append(c['url'])
+    return urls
+
+
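+# Draw up to 100 uncrawled rows (_status=0) for the given company-size class, in random order.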
+def get_next_job(db, table_name, emp):
+    result = db.query('SELECT * FROM {} WHERE _status=0 AND emp={} ORDER BY RAND() LIMIT 100'
+                      .format(table_name, emp))
+    url_pd = pd.DataFrame([dict(i) for i in result])
+    return url_pd
+
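+# The site shows '暫不提供' ("not provided") for hidden fields; store those as empty strings.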
+def check_value(x):
+    if x == '暫不提供':
+        return ''
+    else:
+        return x
+
+def main():
+    # usage: python run.py <port> <emp> <status>, e.g. python run.py 4446 1 0
+    port = 4446
+    emp = 1
+    # status = 0: crawl the company URL list;
+    # status = 1: crawl each company's contact person.
+    status = 0
+    if len(sys.argv) > 3:
+        port = int(sys.argv[1])
+        print('restart docker pw{}'.format(port))
+        os.system('sudo docker container restart pw' + str(port))
+        time.sleep(8)  # give the container time to come back up
+        emp = int(sys.argv[2])
+        status = int(sys.argv[3])
+
+    current = datetime.today().strftime("%Y/%m/%d")
+
+    driver = browser_start(port)
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/104_company?charset=utf8mb4')
+    url_table = db['url_table']
+
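+    # status 0: walk the paginated company list and insert any URLs not seen before.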
+    if status == 0:
+        for page in range(1, 101):
+            print('Page: ', page)
+            # re-read the stored URLs so already-crawled companies are skipped
+            id_cache = build_cache(db, 'url_table')
+
+            # emp=1: 4 employees or fewer, emp=2: 5-9 employees, ...
+            source_url = 'https://www.104.com.tw/company/?jobsource=checkc&emp={}&page={}'.format(emp, page)
+            driver.get(source_url)
+            time.sleep(2)
+
+            company_list = driver.find_elements(By.CLASS_NAME, 'company-list-desktop')
+
+            for id_, company in enumerate(company_list):
+                info_job = company.find_element(By.CLASS_NAME, 'info-job')
+                info_a = info_job.find_element(By.TAG_NAME, "a")
+                url = info_a.get_attribute('href')
+                job_count = company.find_element(By.XPATH, ".//a[@data-gtm-list='工作機會']")
+
+                if url not in id_cache:
+                    url_table.insert({
+                        'company_name': info_a.text,
+                        'url': url,
+                        'job_count': job_count_parse(job_count.text),
+                        'emp': emp,
+                        'crawler_date': current,
+                        '_status': 0
+                    })
+
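+    # status 1: open each stored company page and pull the contact person and phone.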
+    elif status == 1:
+        url_pd = get_next_job(db, 'url_table', emp)
+        for key, group in url_pd.iterrows():
+            url = group['url']
+            print(key, group['company_name'], url)
+            driver.get(url)
+            time.sleep(2)
+
+            # contact name and phone sit at fixed positions in the intro table
+            intro_table = driver.find_element(By.CLASS_NAME, 'intro-table')
+            intro_data = intro_table.find_elements(By.CLASS_NAME, 'intro-table__data')
+            contact_name = check_value(intro_data[1].text)
+            contact_phone = check_value(intro_data[3].text)
+
+            url_table.upsert({'url': url,
+                              '_status': 1,
+                              'contact_name': contact_name,
+                              'contact_phone': contact_phone},
+                             ['url'])
+
+    db.close()
+    driver.quit()
+
+
+if __name__ == "__main__":
+    main()