Jason hai 1 ano
pai
achega
f0d08bbdbc
Modificáronse 2 ficheiros con 14 adicións e 21 borrados
  1. 12 17
      website_clickjobs/type-R/_clickjob.py
  2. 2 4
      website_clickjobs/type-R/tcd.py

+ 12 - 17
website_clickjobs/type-R/_clickjob.py

@@ -1,25 +1,9 @@
-#import redis
-import time
-import traceback
-#import json
 from selenium import webdriver
-from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 import time
 import os
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-import dataset
-
-import json
 import random
-import time
-import sys
-import codecs
-import random
-import os
-import time
 from userAgentRandomizer import userAgents
+from selenium.webdriver.common.by import By
 
 driver=None
 
@@ -94,4 +78,15 @@ def run_once(url):
 
         print('exception:',e)
 
+def resetlst(link):
+    """Open *link* in the shared webdriver and collect the page URLs to visit.
+
+    Args:
+        link: Start URL; it is loaded in the browser and is also the
+            substring an href must contain to be kept.
+
+    Returns:
+        A list of the hrefs found on the page that contain *link* and do not
+        contain ".doc", ".pdf" or ".odt", with *link* itself appended last.
+        Returns None when the page could not be loaded or scanned.
+    """
+    try:
+        if driver is None:
+            # Presumably this (re)creates the module-level driver -- confirm.
+            re_get_webdriver()
 
+        driver.get(link)
+        anchors = driver.find_elements(By.TAG_NAME, "a")
+        hrefs = [a.get_attribute('href') for a in anchors]
+    except Exception as e:
+        # Report the failure instead of hiding it; callers still get None.
+        print('exception:', e)
+        return None
+
+    # ".doc" also matches ".docx", so a separate ".docx" entry is redundant.
+    skipped = (".doc", ".pdf", ".odt")
+    # get_attribute() returns None for <a> tags without an href; evaluating
+    # `link in None` would raise TypeError and throw away the whole result.
+    urls = [
+        h for h in hrefs
+        if h and link in h and not any(ext in h for ext in skipped)
+    ]
+    urls.append(link)
+    return urls

+ 2 - 4
website_clickjobs/type-R/tcd.py

@@ -1,13 +1,11 @@
 from _clickjob import *
 
-def resetlst():
-    return ['https://www.tcdream.taichung.gov.tw/']
-
 lst=[]
+link='https://www.tcdream.taichung.gov.tw/'
 
 #db = dataset.connect('DATABASE LINK HERE')
 
-lst=resetlst()
+lst=resetlst(link)
 
 while True:
     try: