Bläddra i källkod

post-discussion update

Jason 2 år sedan
förälder
incheckning
2bdbdbcd59
2 ändrade filer med 14 tillägg och 9 borttagningar
  1. 11 8
      website_clickjobs/type-1/_clickjob0113.py
  2. 3 1
      website_clickjobs/type-1/hhh_r3.py

+ 11 - 8
website_clickjobs/type-1/_clickjob0113.py

@@ -19,8 +19,6 @@ from selenium.webdriver.common.keys import Keys
 import timeit
 import socket
 
-add_tabs = [7,9,11,13,15,7,9,11,13,15,7,9,11,13,15,7,9,11,13,15]
-
 db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
 driver=None
 headers = {
@@ -71,7 +69,7 @@ def process_query(domain, target_id, brands, query):
     if "sorry" in googleurl:
         return 444
 
-    try:
+    try: #in case there are duplicates...
         clickmore=driver.find_elements("xpath","//p[@class='ofr']/a")
         webdriver.ActionChains(driver).move_to_element(el).perform()
         webdriver.ActionChains(driver).move_to_element(el).click().perform()
@@ -83,10 +81,14 @@ def process_query(domain, target_id, brands, query):
     targets=[]
     targets.append("https://hhh.com.tw/HHH_NEW/designers/index_designerList.php?cid=" + target_id)
     targets.append("https://m.hhh.com.tw/HHH_NEW/designers/index_designerList.php?cid=" + target_id)
-    targets.append("https://www.hhh.com.tw/HHH_NEW/designers/index_designerList.php?cid=" + target_id)
     targets.append("https://hhh.com.tw/HHH_NEW/designers_static/" + target_id + ".html")
     targets.append("https://m.hhh.com.tw/HHH_NEW/designers_static/" + target_id + ".html")
+    targets.append("https://hhh.com.tw/designer-about.php?designer_id=" + target_id + ".html")
+    targets.append("https://m.hhh.com.tw/designer-about.php?designer_id=" + target_id + ".html")
+    
+    targets.append("https://www.hhh.com.tw/HHH_NEW/designers/index_designerList.php?cid=" + target_id)
     targets.append("https://www.hhh.com.tw/HHH_NEW/designers_static/" + target_id + ".html")
+    targets.append("https://www.hhh.com.tw/designer-about.php?designer_id=" + target_id + ".html")
     
     print (len(elmts))
     # driver.save_screenshot('c:/tmp/test.png')
@@ -110,12 +112,13 @@ def process_query(domain, target_id, brands, query):
                     webdriver.ActionChains(driver).move_to_element(el).perform()
                     webdriver.ActionChains(driver).move_to_element(el).click().perform()
                     print("Rank: " + str(n))
-                    time.sleep(15)
+                    duration = random.randint(20,45)
+                    time.sleep(duration)
 
                     print(domain_in_link)
                     return 200
     
-    with open("missing.txt", "a") as file:
+    with open("missing.txt", "a") as file: #if not found
         file.write(target_id + " " + query + "\n")
     
     
@@ -164,6 +167,6 @@ def execute(domain, target_id, brands, target_name):
     if(timetaken < 50):
         extrasleep = 50 - timetaken
     print("Ctrl+C or Ctrl+Z to stop now.")
-    print("You have " + str(30 + extrasleep) + " seconds.")
-    time.sleep(30 + extrasleep)
+    print("You have " + str(10 + extrasleep) + " seconds.")
+    time.sleep(10 + extrasleep)
     return statuscode

+ 3 - 1
website_clickjobs/type-1/hhh_r3.py

@@ -2,10 +2,12 @@ from _clickjob0113 import *
 
 hhhdb = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
 ds=hhhdb.query('select * from hhh.hhh_designer_for_crawl')
+#TAG ABOVE AND UNTAG BELOW FOR DEBUG
+#ds=hhhdb.query('select * from hhh.hhh_designer_for_crawl where hdesigner_id=XXX')
 data=[]
 for i in ds:
     for j in range(i['weight']):
-        data.append([i['hdesigner_id'],i['title'].replace('\xa0','')])
+        data.append([i['hdesigner_id'],i['title'].replace('\xa0',' ')])
 random.shuffle(data)
 
 domain = 'hhh.com.tw'