noodles 2 năm trước cách đây
mục cha
commit
ac95553861
1 tập tin đã thay đổi với 124 bổ sung và 0 xóa
  1. 124 0
      run.py

+ 124 - 0
run.py

@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.support.wait import WebDriverWait
+import time, pickle, sys, os, re, time, requests
+import dataset
+import pandas as pd
+from datetime import datetime, timedelta
+from bs4 import BeautifulSoup
+
+
+def brower_start(port):
+    """Open a Chrome session on the Selenium server at 127.0.0.1:``port``.
+
+    Args:
+        port: Port number of the local Selenium endpoint (``/wd/hub``).
+
+    Returns:
+        The connected ``webdriver.Remote`` instance.
+    """
+    options = webdriver.ChromeOptions()
+    # Hand over the options object itself instead of the
+    # ``desired_capabilities`` keyword: that keyword was deprecated in
+    # Selenium 4 and later removed, while ``options`` is accepted by both
+    # the 3.x and 4.x Remote constructors.
+    return webdriver.Remote(
+        command_executor='http://127.0.0.1:{}/wd/hub'.format(port),
+        options=options,
+    )
+
+
+def job_count_parse(x):
+    """Convert a label such as '工作機會(12)' into the integer it contains.
+
+    Every occurrence of the '工作機會(' prefix and of ')' is removed and the
+    remainder is passed to ``int``, so a ``ValueError`` is raised when what
+    is left is not a valid integer literal.
+    """
+    for token in ('工作機會(', ')'):
+        x = x.replace(token, '')
+    return int(x)
+
+
+def build_cache(db, table):
+    """Return the ``url`` column of ``104_company.<table>`` as a list.
+
+    Args:
+        db: Object exposing ``query(sql)`` that yields mapping-like rows.
+        table: Table name interpolated into the SELECT statement.
+
+    Returns:
+        A list with one ``url`` value per row, in the order rows are yielded.
+    """
+    rows = db.query('SELECT url FROM 104_company.{};'.format(table))
+    return [row['url'] for row in rows]
+
+
+def get_next_job(db, table_name, emp):
+    """Fetch up to 100 randomly ordered rows that still have ``_status=0``.
+
+    Args:
+        db: Object exposing ``query(sql)`` that yields mapping-like rows.
+        table_name: Table to read from.
+        emp: Value the ``emp`` column must equal.
+
+    Returns:
+        A ``pandas.DataFrame`` with one row per record returned by the query.
+    """
+    sql = 'SELECT * FROM {} WHERE _status=0 and emp={} ORDER BY RAND() limit 100'\
+        .format(table_name, emp)
+    records = list(map(dict, db.query(sql)))
+    return pd.DataFrame(records)
+
+def check_value(x):
+    """Return '' when ``x`` equals the placeholder '暫不提供', else ``x`` as is."""
+    return '' if x == '暫不提供' else x
+
+def _crawl_company_urls(driver, db, url_table, emp, crawl_date):
+    """Walk listing pages 1-100 for ``emp`` and insert company URLs not yet stored."""
+    # Read the stored URLs once and track this run's inserts locally rather
+    # than re-querying the whole table for every page; a set makes each
+    # membership test O(1).
+    # NOTE(review): assumes no other process inserts the same URLs while
+    # this run is in progress - confirm.
+    seen_urls = set(build_cache(db, 'url_table'))
+
+    for page in range(1, 101):
+        print('Page: ', page)
+        # emp=1: up to 4 employees, emp=2: 5-9 employees, ...
+        source_url = 'https://www.104.com.tw/company/?jobsource=checkc&emp={}&page={}'.format(emp, page)
+        driver.get(source_url)
+        # Fixed pause after navigation; presumably lets the listing render.
+        time.sleep(2)
+
+        for company in driver.find_elements(By.CLASS_NAME, 'company-list-desktop'):
+            info_job = company.find_element(By.CLASS_NAME, 'info-job')
+            info_a = info_job.find_element(By.TAG_NAME, 'a')
+            url = info_a.get_attribute('href')
+            if url in seen_urls:
+                continue
+
+            # Only look up the job-count link for companies actually stored.
+            job_count = company.find_element(By.XPATH, ".//a[@data-gtm-list='工作機會']")
+            url_table.insert({
+                'company_name': info_a.text,
+                'url': url,
+                'job_count': job_count_parse(job_count.text),
+                'emp': emp,
+                'crawler_date': crawl_date,
+                '_status': 0,
+            })
+            seen_urls.add(url)
+
+
+def _crawl_contacts(driver, db, url_table, emp):
+    """Visit pending company pages for ``emp`` and store their contact fields."""
+    url_pd = get_next_job(db, 'url_table', emp)
+    for key, group in url_pd.iterrows():
+        url = group['url']
+        print(key, group['company_name'], url)
+        driver.get(url)
+        time.sleep(2)
+
+        intro_table = driver.find_element(By.CLASS_NAME, 'intro-table')
+        intro_data = intro_table.find_elements(By.CLASS_NAME, 'intro-table__data')
+        # Cells 1 and 3 are stored as contact name and phone.
+        # NOTE(review): the indices depend on the page layout - an
+        # IndexError here means the layout differs from what is expected.
+        contact_name = check_value(intro_data[1].text)
+        contact_phone = check_value(intro_data[3].text)
+
+        # Setting _status=1 keeps the row out of later get_next_job batches.
+        url_table.upsert({'url': url,
+                          '_status': 1,
+                          'contact_name': contact_name,
+                          'contact_phone': contact_phone},
+                         ['url'])
+
+
+def main():
+    """Run one crawl pass against 104.com.tw company pages.
+
+    Command line: ``python run.py [port [emp [status]]]``, for example
+    ``python run.py 4446 1 0``. Missing arguments fall back to port 4446,
+    emp 1 and status 0.
+
+    * ``status == 0``: collect company URLs from the listing pages.
+    * ``status == 1``: collect the contact person of stored companies.
+
+    When at least one argument is given, the Docker container ``pw<port>``
+    is restarted before the browser session is opened.
+    """
+    argv = sys.argv
+    # Parse every argument before touching Docker so that a malformed or
+    # partial command line cannot fail after the container was restarted.
+    port = int(argv[1]) if len(argv) > 1 else 4446
+    emp = int(argv[2]) if len(argv) > 2 else 1
+    status = int(argv[3]) if len(argv) > 3 else 0
+
+    if len(argv) > 1:
+        print('restart docker pw{}'.format(port))
+        os.system('sudo docker container restart pw'+str(port))
+        time.sleep(8)
+
+    current = datetime.today().strftime("%Y/%m/%d")
+
+    driver = brower_start(port)
+    try:
+        # NOTE(review): database credentials are hard-coded in source;
+        # consider moving them to configuration outside the repository.
+        db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/104_company?charset=utf8mb4')
+        try:
+            url_table = db['url_table']
+            if status == 0:
+                _crawl_company_urls(driver, db, url_table, emp, current)
+            elif status == 1:
+                _crawl_contacts(driver, db, url_table, emp)
+        finally:
+            db.close()
+    finally:
+        # quit() ends the remote session; close() would only close the
+        # current window and leave the browser session running.
+        driver.quit()
+
+
+# Entry point: run the crawler only when this file is executed as a script.
+if __name__ == "__main__":
+    main()