@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+import time, sys, os
+import dataset
+import pandas as pd
+from datetime import datetime
+
+
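+# Connect to the remote Chrome session exposed by the Selenium container on this port.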
+def browser_start(port):
+    options = webdriver.ChromeOptions()
+    browser = webdriver.Remote(
+        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
+        options=options
+    )
+    return browser
+
+
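+# Parse the link text '工作機會(N)' ("N job openings") into the integer N.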
+def job_count_parse(x):
+    x = x.replace('工作機會(', '').replace(')', '')
+    return int(x)
+
+
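+# Return every company URL already stored, so crawled pages can be skipped.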
+def build_cache(db, table):
+    urls = []
+    cursor = db.query('SELECT url FROM 104_company.{};'.format(table))
+    for c in cursor:
+        urls.append(c['url'])
+    return urls
+
+
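+# Draw up to 100 uncrawled rows (_status=0) for the given company-size class, in random order.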
+def get_next_job(db, table_name, emp):
+    result = db.query('SELECT * FROM {} WHERE _status=0 AND emp={} ORDER BY RAND() LIMIT 100'
+                      .format(table_name, emp))
+    url_pd = pd.DataFrame([dict(i) for i in result])
+    return url_pd
+
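+# The site shows '暫不提供' ("not provided") for hidden fields; store those as empty strings.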
+def check_value(x):
+    if x == '暫不提供':
+        return ''
+    else:
+        return x
+
+def main():
+    # usage: python run.py <port> <emp> <status>, e.g. python run.py 4446 1 0
+    port = 4446
+    emp = 1
+    # status = 0: crawl the company URL list;
+    # status = 1: crawl each company's contact person.
+    status = 0
+    if len(sys.argv) > 3:
+        port = int(sys.argv[1])
+        print('restart docker pw{}'.format(port))
+        os.system('sudo docker container restart pw' + str(port))
+        time.sleep(8)  # give the container time to come back up
+        emp = int(sys.argv[2])
+        status = int(sys.argv[3])
+
+    current = datetime.today().strftime("%Y/%m/%d")
+
+    driver = browser_start(port)
+    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/104_company?charset=utf8mb4')
+    url_table = db['url_table']
+
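+    # status 0: walk the paginated company list and insert any URLs not seen before.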
+    if status == 0:
+        for page in range(1, 101):
+            print('Page: ', page)
+            # re-read the stored URLs so already-crawled companies are skipped
+            id_cache = build_cache(db, 'url_table')
+
+            # emp=1: 4 employees or fewer, emp=2: 5-9 employees, ...
+            source_url = 'https://www.104.com.tw/company/?jobsource=checkc&emp={}&page={}'.format(emp, page)
+            driver.get(source_url)
+            time.sleep(2)
+
+            company_list = driver.find_elements(By.CLASS_NAME, 'company-list-desktop')
+
+            for id_, company in enumerate(company_list):
+                info_job = company.find_element(By.CLASS_NAME, 'info-job')
+                info_a = info_job.find_element(By.TAG_NAME, "a")
+                url = info_a.get_attribute('href')
+                job_count = company.find_element(By.XPATH, ".//a[@data-gtm-list='工作機會']")
+
+                if url not in id_cache:
+                    url_table.insert({
+                        'company_name': info_a.text,
+                        'url': url,
+                        'job_count': job_count_parse(job_count.text),
+                        'emp': emp,
+                        'crawler_date': current,
+                        '_status': 0
+                    })
+
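+    # status 1: open each stored company page and pull the contact person and phone.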
+    elif status == 1:
+        url_pd = get_next_job(db, 'url_table', emp)
+        for key, group in url_pd.iterrows():
+            url = group['url']
+            print(key, group['company_name'], url)
+            driver.get(url)
+            time.sleep(2)
+
+            # contact name and phone sit at fixed positions in the intro table
+            intro_table = driver.find_element(By.CLASS_NAME, 'intro-table')
+            intro_data = intro_table.find_elements(By.CLASS_NAME, 'intro-table__data')
+            contact_name = check_value(intro_data[1].text)
+            contact_phone = check_value(intro_data[3].text)
+
+            url_table.upsert({'url': url,
+                              '_status': 1,
+                              'contact_name': contact_name,
+                              'contact_phone': contact_phone},
+                             ['url'])
+
+    db.close()
+    driver.quit()
+
+
+if __name__ == "__main__":
+    main()