# -*- coding: utf-8 -*-
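"""Crawler for 104.com.tw company listings.

Two-stage pipeline driven by the _status flag in url_table:
  status 0: collect company page URLs and job counts into url_table
  status 1: visit each stored URL and scrape the contact name and phone

Usage: python run.py <selenium_port> <emp> <status>
e.g.:  python run.py 4446 1 0
"""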
from selenium import webdriver
from selenium.webdriver.common.by import By
import time, sys, os
import dataset
import pandas as pd
from datetime import datetime
def browser_start(port):
    """Connect to a remote Selenium Chrome session on the given local port."""
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:{}/wd/hub'.format(port),
        options=options  # Selenium 4 style (desired_capabilities is deprecated)
    )
    return browser
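# Example use (assumes a selenium/standalone-chrome container named pw4446
# publishing port 4446, matching the docker restart in main() below):
#   driver = browser_start(4446)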
def job_count_parse(x):
    """Strip the '工作機會(...)' wrapper from a job-count label and return the int."""
    x = x.replace('工作機會(', '').replace(')', '')
    return int(x)
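# e.g. job_count_parse('工作機會(12)') -> 12  ('工作機會' = "job openings")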
def build_cache(db, table):
    """Return the set of company URLs already stored, for O(1) membership checks."""
    cursor = db.query('SELECT url FROM 104_company.{};'.format(table))
    return {c['url'] for c in cursor}
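# Example: rebuild the dedup cache before scraping each listing page.
#   seen_urls = build_cache(db, 'url_table')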
def get_next_job(db, table_name, emp):
    """Fetch up to 100 uncrawled (_status=0) companies of the given size class,
    in random order, as a DataFrame."""
    result = db.query('SELECT * FROM {} WHERE _status=0 AND emp={} ORDER BY RAND() LIMIT 100'
                      .format(table_name, emp))
    return pd.DataFrame([dict(i) for i in result])
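# Example: fetch a batch of uncrawled companies with 5-9 employees (emp=2):
#   batch = get_next_job(db, 'url_table', emp=2)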
def check_value(x):
    """Map the site's placeholder '暫不提供' ("not provided") to an empty string."""
    return '' if x == '暫不提供' else x
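# e.g. check_value('暫不提供') -> '', while any real value passes through unchanged.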
def main():
    # Usage: python run.py <port> <emp> <status>, e.g. python run.py 4446 1 0
    port = 4446
    emp = 1
    # status 0: crawl the company URL list;
    # status 1: crawl each company's contact person.
    status = 0
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        # Restart the Selenium container for this port to start from a clean session.
        print('restart docker pw{}'.format(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(8)
        emp = int(sys.argv[2])
        status = int(sys.argv[3])
    current = datetime.today().strftime("%Y/%m/%d")
    driver = browser_start(port)
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/104_company?charset=utf8mb4')
    url_table = db['url_table']
    if status == 0:
        for page in range(1, 101):
            print('Page: ', page)
            # Refresh the set of already-stored URLs so reruns skip duplicates.
            id_cache = build_cache(db, 'url_table')
            # emp=1: 4 or fewer employees, emp=2: 5-9 employees, ...
            source_url = 'https://www.104.com.tw/company/?jobsource=checkc&emp={}&page={}'.format(emp, page)
            driver.get(source_url)
            time.sleep(2)

            company_list = driver.find_elements(By.CLASS_NAME, 'company-list-desktop')
            for id_, company in enumerate(company_list):
                info_job = company.find_element(By.CLASS_NAME, 'info-job')
                info_a = info_job.find_element(By.TAG_NAME, 'a')
                url = info_a.get_attribute('href')
                job_count = company.find_element(By.XPATH, ".//a[@data-gtm-list='工作機會']")
                if url not in id_cache:
                    url_table.insert({
                        'company_name': info_a.text,
                        'url': url,
                        'job_count': job_count_parse(job_count.text),
                        'emp': emp,
                        'crawler_date': current,
                        '_status': 0
                    })

    elif status == 1:
        url_pd = get_next_job(db, 'url_table', emp)
        for key, group in url_pd.iterrows():
            url = group['url']
            print(key, group['company_name'], url)
            driver.get(url)
            time.sleep(2)
            # intro-table fields: index 1 holds the contact person, index 3 the phone.
            intro_table = driver.find_element(By.CLASS_NAME, 'intro-table')
            intro_data = intro_table.find_elements(By.CLASS_NAME, 'intro-table__data')
            contact_name = check_value(intro_data[1].text)
            contact_phone = check_value(intro_data[3].text)
            url_table.upsert({'url': url,
                              '_status': 1,
                              'contact_name': contact_name,
                              'contact_phone': contact_phone},
                             ['url'])
    db.close()
    driver.quit()  # end the remote session entirely
if __name__ == "__main__":
    main()
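
# For reference, a minimal url_table schema consistent with the columns this
# script reads and writes (a sketch; types are assumptions, and the dataset
# library will also create missing columns on the fly):
#
#   CREATE TABLE url_table (
#       id            INT AUTO_INCREMENT PRIMARY KEY,
#       company_name  VARCHAR(255),
#       url           VARCHAR(512) UNIQUE,
#       job_count     INT,
#       emp           INT,
#       crawler_date  VARCHAR(10),
#       contact_name  VARCHAR(255),
#       contact_phone VARCHAR(255),
#       _status       TINYINT DEFAULT 0
#   );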