Explorar o código

add hhh clickjob (reserved atm)

Jason hai 2 anos
pai
achega
901d102d2e
Modificáronse 2 ficheiros con 77 adicións e 10 borrados
  1. 27 10
      website_clickjobs/type-1/_clickjob.py
  2. 50 0
      website_clickjobs/type-1/hhh.py

+ 27 - 10
website_clickjobs/type-1/_clickjob.py

@@ -16,6 +16,7 @@ import dataset
 import traceback
 import sys
 from selenium.webdriver.common.keys import Keys
+import timeit
 
 add_tabs = [7,9,11,13,15,7,9,11,13,15,7,9,11,13,15,7,9,11,13,15]
 
@@ -39,7 +40,6 @@ def empty_query(q):
     driver.get(googleurl)
     time.sleep(3)
 
-
 def process_query(domain, target_domain, brands, query):
     print(query)
     sleepoffset = 0
@@ -64,6 +64,9 @@ def process_query(domain, target_domain, brands, query):
     googleurl = driver.current_url
     print(driver.current_url)
 
+    if "sorry" in googleurl:
+        return 444
+
     elmts=driver.find_elements("xpath","//div[@class='yuRUbf']/a")
 
     print (len(elmts))
@@ -72,6 +75,7 @@ def process_query(domain, target_domain, brands, query):
     for el in elmts:
         href=el.get_attribute('href')
         txt=el.text
+        print(href)
         if len(txt)>10:
             if domain in href:
                 domain_in_link += 1
@@ -85,7 +89,7 @@ def process_query(domain, target_domain, brands, query):
                 new_windows_count = add_tabs[random.randint(0,19)]
                 print(str(new_windows_count) + " new tabs")
                 for i in range (0,new_windows_count):
-                    print("Tab" + str(i+1))
+                    print("Tab " + str(i+1))
                     #original_window = driver.current_window_handle
                     #driver.switch_to.new_window('window')
                     #driver.get(href)
@@ -111,6 +115,7 @@ def process_query(domain, target_domain, brands, query):
             
 
     print(domain_in_link)
+    return 200
     
 
 def run_once(domain, target_domain, brands, query):
@@ -129,20 +134,32 @@ def run_once(domain, target_domain, brands, query):
     driver.delete_all_cookies()
     driver.set_window_size(1400,1000)
 
-    process_query(domain, target_domain, brands, query)
-    time.sleep(3)
+    statuscode = process_query(domain, target_domain, brands, query)
     driver.quit()
 
+    return statuscode
+
 #execution starts here
 
 def execute(domain, target_domain, brands, query_list):
     while True:
+        print("Ctrl+C or Ctrl+Z to stop.")
+        st = timeit.default_timer()
         try:
-            run_once(domain, target_domain, brands, random.choice(query_list))
+            statuscode = run_once(domain, target_domain, brands, random.choice(query_list))
         except:
             traceback.print_exc()
-        sleepint = random.randint(75,90)
-        sleepint = sleepint - sleepoffset
-        print("Completed (" + str(sleepint) + ")")
-        if sleepint > 0:
-            time.sleep(sleepint)
+        timetaken = timeit.default_timer()-st
+        print("Time taken: " + str(timetaken))
+        
+        print("Process returned with " + str(statuscode))
+        if statuscode == 444:
+            print("You have been caught!!! Program terminating.")
+            break
+
+        extrasleep = 0
+        if(timetaken < 70):
+            extrasleep = 70 - timetaken
+        print("Ctrl+C or Ctrl+Z to stop now.")
+        print("You have " + str(10 + extrasleep) + " seconds.")
+        time.sleep(10 + extrasleep)

+ 50 - 0
website_clickjobs/type-1/hhh.py

@@ -0,0 +1,50 @@
+from _clickjob import *
+import xmltodict
+
def get_xml_data(urls, priority_list=None):
    """Build the list of designer-page URLs to target.

    When the priority list is non-empty, static page URLs are derived
    from the numeric ID in each entry, and each entry's first field is
    rewritten in place to that URL (the main loop later passes entry[0]
    as the target). Otherwise the sitemap XML files in *urls* are
    fetched and every ``<loc>`` entry is collected.

    Args:
        urls: iterable of sitemap URLs; only fetched when there are no
            priority entries.
        priority_list: optional override for the module-level ``priority``
            list; defaults to that global for backward compatibility.

    Returns:
        list[str]: the target page URLs. Also prints up to the first 10
        collected URLs for logging.
    """
    if priority_list is None:
        priority_list = priority
    data = []
    if len(priority_list) != 0:
        for p in priority_list:
            # Build the URL once (the original built it three times and
            # left an unused `link` local), then rewrite the entry in
            # place: p[0] goes from numeric ID to full URL.
            url = 'https://hhh.com.tw/HHH_NEW/designers_static/' + str(p[0]) + '.html'
            data.append(url)
            p[0] = url
    else:
        for u in urls:
            page = requests.get(u)
            designerdata = xmltodict.parse(page.content)['urlset']['url']
            for d in designerdata:
                data.append(d['loc'])

    # Log a sample of what will be targeted.
    if len(data) >= 10:
        print(data[:10])
    else:
        print(data)
    return data
+
# Sitemap sources, used only when the priority list below is empty.
sourceurls = ["https://hhh.com.tw/HHH_NEW/sitemap/sitemap.xml","https://m.hhh.com.tw/HHH_NEW/sitemap/sitemap.xml"]

# PRIORITY IDs, clear list (make it become [] ) to disable priority
priority = [[54, '虞國綸', '格綸設計'], [793, '許伯争', '樂作空間設計']]
#priority = [[54, '虞國綸', '格綸設計'], [793, '許伯争', '樂作空間設計'], 32]      # RESERVED -- LEVEL 2
#priority = [[54, '虞國綸', '格綸設計'], [793, '許伯争', '樂作空間設計'], 32, 331] # RESERVED -- LEVEL 3

domain = 'hhh.com.tw'
# NOTE: with a non-empty priority list this also rewrites each priority
# entry's first field from an ID to a URL (see get_xml_data).
target_domain = get_xml_data(sourceurls)
brands = {domain: 'hhh'}
# Fix: ('幸福空間') is just a parenthesized string, so random.choice()
# inside execute() would pick single characters; the trailing comma
# makes it a proper one-element tuple of queries.
query_list = ('幸福空間',)
+
# Main dispatch loop. While priority targets are configured, pick one at
# random and hand it to execute(); the sitemap-wide path is still
# reserved (its execute() call is commented out) and only reports the
# number of collected URLs.
while True:
    if priority:
        chosen = random.choice(priority)
        # Slice copy of the entry's name fields: these become the queries.
        queries = chosen[1:]
        # Roughly half the time, prefix every query with the brand name.
        if random.randint(0, 3) % 2 == 1:
            queries = ['幸福空間 ' + q for q in queries]
        print(queries)
        execute(domain, chosen[0], brands, queries)
    else:
        chosen = random.choice(target_domain)
        #execute(domain, target_domain, brands, query_list)
        print(len(target_domain))
    time.sleep(60)