|
@@ -1,25 +1,9 @@
|
|
|
-#import redis
|
|
|
-import time
|
|
|
-import traceback
|
|
|
-#import json
|
|
|
from selenium import webdriver
|
|
|
-from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
|
import time
|
|
|
import os
|
|
|
-from selenium.webdriver.support.ui import WebDriverWait
|
|
|
-from selenium.webdriver.common.by import By
|
|
|
-from selenium.webdriver.support import expected_conditions as EC
|
|
|
-import dataset
|
|
|
-
|
|
|
-import json
|
|
|
import random
|
|
|
-import time
|
|
|
-import sys
|
|
|
-import codecs
|
|
|
-import random
|
|
|
-import os
|
|
|
-import time
|
|
|
from userAgentRandomizer import userAgents
|
|
|
+from selenium.webdriver.common.by import By
|
|
|
|
|
|
driver=None
|
|
|
|
|
@@ -94,4 +78,15 @@ def run_once(url):
|
|
|
|
|
|
print('exception:',e)
|
|
|
|
|
|
def resetlst(link):
    """Load *link* in the shared webdriver and collect the links found on it.

    Every ``<a>`` element on the loaded page is inspected; an href is kept
    when it contains *link* as a substring and does not contain any of the
    document markers ``.doc`` (which also covers ``.docx``), ``.pdf`` or
    ``.odt``. *link* itself is appended as the last element.

    Args:
        link: URL to open; also used as the substring each href must contain.

    Returns:
        A list of href strings followed by *link*, or ``None`` when loading
        the page or reading the anchors fails.
    """
    # ".doc" is a substring of ".docx", so one marker filters both.
    skipped_markers = (".doc", ".pdf", ".odt")
    try:
        if driver is None:
            # NOTE(review): presumably re_get_webdriver() (re)creates the
            # module-level `driver` -- confirm against its definition.
            re_get_webdriver()

        driver.get(link)
        anchors = driver.find_elements(By.TAG_NAME, "a")
        hrefs = (anchor.get_attribute('href') for anchor in anchors)
        urls = [
            href
            for href in hrefs
            # Anchors without an href yield None; testing `link in None`
            # would raise and throw away every link on the page.
            if href
            and link in href
            and not any(marker in href for marker in skipped_markers)
        ]
        urls.append(link)
        return urls
    except Exception as e:
        # Best-effort: report the failure the same way run_once() does and
        # signal it to the caller with None instead of crashing the crawl.
        print('exception:', e)
        return None
|