run.py 4.0 KB

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.wait import WebDriverWait
import time, pickle, sys, os, re, requests
import dataset
import pandas as pd
from datetime import datetime, timedelta
from bs4 import BeautifulSoup


def brower_start(port):
    # Attach to a remote ChromeDriver (the Selenium container) listening on the given port.
    options = webdriver.ChromeOptions()
    browser = webdriver.Remote(
        command_executor='http://127.0.0.1:' + str(port) + '/wd/hub',
        desired_capabilities=options.to_capabilities()
    )
    return browser


def job_count_parse(x):
    # e.g. '工作機會(12)' -> 12
    x = x.replace('工作機會(', '').replace(')', '')
    return int(x)


def build_cache(db, table):
    id_dict = []
    cursor = db.query('SELECT url FROM 104_company.{};'.format(table))
    for c in cursor:
        id_dict += [c['url']]
    return id_dict


def get_next_job(db, table_name, emp):
    # Pick up to 100 random rows that have not been crawled yet (_status=0).
    result = db.query('SELECT * FROM {} WHERE _status=0 and emp={} ORDER BY RAND() limit 100'
                      .format(table_name, emp))
    url_pd = pd.DataFrame([dict(i) for i in result])
    return url_pd


def check_value(x):
    # '暫不提供' means "not provided"; normalize it to an empty string.
    if x == '暫不提供':
        return ''
    else:
        return x


def main():
    # usage: python run.py <port> <emp> <status>, e.g. python run.py 4446 1 0
    port = 4446
    emp = 1
    # status = 0: crawl the company URL list;
    # status = 1: crawl each company's contact person.
    status = 0
    if len(sys.argv) > 1:
        port = int(sys.argv[1])
        print('restart docker pw{}'.format(port))
        os.system('sudo docker container restart pw' + str(port))
        time.sleep(8)
        emp = int(sys.argv[2])
        status = int(sys.argv[3])

    current = datetime.today().strftime("%Y/%m/%d")
    driver = brower_start(port)
    db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/104_company?charset=utf8mb4')
    url_table = db['url_table']

    if status == 0:
        for page in range(1, 101):
            print('Page: ', page)
            # find company urls not seen before
            id_cache = build_cache(db, 'url_table')
            # emp=1: 4 employees or fewer, emp=2: 5-9 employees, ...
            source_url = 'https://www.104.com.tw/company/?jobsource=checkc&emp={}&page={}'.format(emp, page)
            driver.get(source_url)
            time.sleep(2)
            company_list = driver.find_elements_by_class_name('company-list-desktop')
            for id_, company in enumerate(company_list):
                info_job = company.find_element_by_class_name('info-job')
                info_a = info_job.find_element_by_tag_name("a")
                url = info_a.get_attribute('href')
                job_count = company.find_element_by_xpath(".//a[@data-gtm-list='工作機會']")
                if url not in id_cache:
                    url_table.insert({
                        'company_name': info_a.text,
                        'url': url,
                        'job_count': job_count_parse(job_count.text),
                        'emp': emp,
                        'crawler_date': current,
                        '_status': 0
                    })
    elif status == 1:
        url_pd = get_next_job(db, 'url_table', emp)
        for key, group in url_pd.iterrows():
            url = group['url']
            print(key, group['company_name'], url)
            driver.get(url)
            time.sleep(2)
            intro_table = driver.find_element_by_class_name('intro-table')
            intro_data = intro_table.find_elements_by_class_name('intro-table__data')
            contact_name = check_value(intro_data[1].text)
            contact_phone = check_value(intro_data[3].text)
            url_table.upsert({'url': url,
                              '_status': 1,
                              'contact_name': contact_name,
                              'contact_phone': contact_phone},
                             ['url'])

    db.close()
    driver.close()


if __name__ == "__main__":
    main()
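
For reference, below is a minimal sketch of how the rows collected by run.py could be read back out of url_table with the same dataset library. The connection string and column names are copied from the script above; export_contacts and contacts.csv are hypothetical names used only for this example.

# export_contacts.py -- hypothetical companion script, not part of run.py
import dataset
import pandas as pd

def export_contacts(db_url, emp):
    # run.py marks finished rows with _status=1 and fills in contact_name / contact_phone.
    db = dataset.connect(db_url)
    rows = db['url_table'].find(_status=1, emp=emp)
    df = pd.DataFrame([dict(r) for r in rows])
    db.close()
    return df

if __name__ == '__main__':
    df = export_contacts('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/104_company?charset=utf8mb4', emp=1)
    df.to_csv('contacts.csv', index=False)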