# -*- coding: utf-8 -*-
import os
import sys
import time
from datetime import datetime

import dataset
import pandas as pd
from selenium import webdriver


def browser_start(port):
    """Attach to a remote Selenium Chrome node listening on the given port."""
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser


def job_count_parse(x):
    """Turn a label like '工作機會(12)' into the integer 12."""
    x = x.replace('工作機會(', '').replace(')', '')
    return int(x)


def build_cache(db, table):
    """Return the company URLs already stored, so duplicates can be skipped."""
    cursor = db.query('SELECT url FROM 104_company.{};'.format(table))
    return [c['url'] for c in cursor]


def get_next_job(db, table_name, emp):
    """Fetch up to 100 uncrawled company rows for the given employee-count bucket."""
    result = db.query('SELECT * FROM {} WHERE _status=0 AND emp={} '
                      'ORDER BY RAND() LIMIT 100'.format(table_name, emp))
    url_pd = pd.DataFrame([dict(i) for i in result])
    return url_pd


def check_value(x):
    """Map the placeholder '暫不提供' ("not provided") to an empty string."""
    if x == '暫不提供':
        return ''
    return x


def main():
    # script template: python run.py 4446 1 0
    port = 4446
    emp = 1
    # status = 0: crawl the company URL list;
    # status = 1: crawl each company's contact person.
    status = 0

    if len(sys.argv) > 3:
        port = int(sys.argv[1])
        print('restart docker pw{}'.format(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(8)
        emp = int(sys.argv[2])
        status = int(sys.argv[3])

    current = datetime.today().strftime("%Y/%m/%d")
    driver = browser_start(port)
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/104_company?charset=utf8mb4')
    url_table = db['url_table']

    if status == 0:
        for page in range(1, 101):
            print('Page: ', page)

            # URLs already in the table, so the same company is not inserted twice
            id_cache = build_cache(db, 'url_table')

            # emp=1: 4 or fewer employees, emp=2: 5-9 employees, ...
            source_url = 'https://www.104.com.tw/company/?jobsource=checkc&emp={}&page={}'.format(emp, page)
            driver.get(source_url)
            time.sleep(2)

            company_list = driver.find_elements_by_class_name('company-list-desktop')
            for id_, company in enumerate(company_list):
                info_job = company.find_element_by_class_name('info-job')
                info_a = info_job.find_element_by_tag_name("a")
                url = info_a.get_attribute('href')
                job_count = company.find_element_by_xpath(".//a[@data-gtm-list='工作機會']")

                if url not in id_cache:
                    url_table.insert({
                        'company_name': info_a.text,
                        'url': url,
                        'job_count': job_count_parse(job_count.text),
                        'emp': emp,
                        'crawler_date': current,
                        '_status': 0
                    })

    elif status == 1:
        url_pd = get_next_job(db, 'url_table', emp)

        for key, group in url_pd.iterrows():
            url = group['url']
            print(key, group['company_name'], url)
            driver.get(url)
            time.sleep(2)

            intro_table = driver.find_element_by_class_name('intro-table')
            intro_data = intro_table.find_elements_by_class_name('intro-table__data')
            contact_name = check_value(intro_data[1].text)
            contact_phone = check_value(intro_data[3].text)

            url_table.upsert({'url': url,
                              '_status': 1,
                              'contact_name': contact_name,
                              'contact_phone': contact_phone},
                             ['url'])

    db.close()
    driver.close()


if __name__ == "__main__":
    main()
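
# Usage sketch (illustrative; assumes the remote Selenium container is named
# `pw<port>` and this file is saved as run.py, matching the "script template"
# comment in main() -- adjust the port, container name, and values to your setup):
#
#   python run.py 4446 1 0   # pass 1: collect company URLs (emp=1, status=0)
#   python run.py 4446 1 1   # pass 2: scrape each company's contact name/phone (status=1)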