# -*- coding: utf-8 -*-
import os
import sys
import time
from datetime import datetime

import dataset
import pandas as pd
from selenium import webdriver


def browser_start(port):
    """Attach to a remote Selenium Chrome node listening on the given port."""
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser


def job_count_parse(x):
    """Turn a label like '工作機會(12)' into the integer 12."""
    x = x.replace('工作機會(', '').replace(')', '')
    return int(x)


def build_cache(db, table):
    """Return the company URLs already stored, so duplicates can be skipped."""
    cursor = db.query('SELECT url FROM 104_company.{};'.format(table))
    return [c['url'] for c in cursor]


def get_next_job(db, table_name, emp):
    """Fetch up to 100 uncrawled company rows for the given employee-count bucket."""
    result = db.query('SELECT * FROM {} WHERE _status=0 AND emp={} '
                      'ORDER BY RAND() LIMIT 100'.format(table_name, emp))
    url_pd = pd.DataFrame([dict(i) for i in result])
    return url_pd


def check_value(x):
    """Map the placeholder '暫不提供' ("not provided") to an empty string."""
    if x == '暫不提供':
        return ''
    return x


def main():
    # script template: python run.py 4446 1 0
    port = 4446
    emp = 1
    # status = 0: crawl the company URL list;
    # status = 1: crawl each company's contact person.
    status = 0

    if len(sys.argv) > 3:
        port = int(sys.argv[1])
        print('restart docker pw{}'.format(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(8)
        emp = int(sys.argv[2])
        status = int(sys.argv[3])

    current = datetime.today().strftime("%Y/%m/%d")
    driver = browser_start(port)
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/104_company?charset=utf8mb4')
    url_table = db['url_table']

    if status == 0:
        for page in range(1, 101):
            print('Page: ', page)

            # URLs already in the table, so the same company is not inserted twice
            id_cache = build_cache(db, 'url_table')

            # emp=1: 4 or fewer employees, emp=2: 5-9 employees, ...
            source_url = 'https://www.104.com.tw/company/?jobsource=checkc&emp={}&page={}'.format(emp, page)
            driver.get(source_url)
            time.sleep(2)

            company_list = driver.find_elements_by_class_name('company-list-desktop')
            for id_, company in enumerate(company_list):
                info_job = company.find_element_by_class_name('info-job')
                info_a = info_job.find_element_by_tag_name("a")
                url = info_a.get_attribute('href')
                job_count = company.find_element_by_xpath(".//a[@data-gtm-list='工作機會']")

                if url not in id_cache:
                    url_table.insert({
                        'company_name': info_a.text,
                        'url': url,
                        'job_count': job_count_parse(job_count.text),
                        'emp': emp,
                        'crawler_date': current,
                        '_status': 0
                    })

    elif status == 1:
        url_pd = get_next_job(db, 'url_table', emp)

        for key, group in url_pd.iterrows():
            url = group['url']
            print(key, group['company_name'], url)
            driver.get(url)
            time.sleep(2)

            intro_table = driver.find_element_by_class_name('intro-table')
            intro_data = intro_table.find_elements_by_class_name('intro-table__data')
            contact_name = check_value(intro_data[1].text)
            contact_phone = check_value(intro_data[3].text)

            url_table.upsert({'url': url,
                              '_status': 1,
                              'contact_name': contact_name,
                              'contact_phone': contact_phone},
                             ['url'])

    db.close()
    driver.close()


if __name__ == "__main__":
    main()
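
# Usage sketch (illustrative; assumes the remote Selenium container is named
# `pw<port>` and this file is saved as run.py, matching the "script template"
# comment in main() -- adjust the port, container name, and values to your setup):
#
#   python run.py 4446 1 0   # pass 1: collect company URLs (emp=1, status=0)
#   python run.py 4446 1 1   # pass 2: scrape each company's contact name/phone (status=1)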