Jason 2 år sedan
förälder
incheckning
de9fe51b03
2 ändrade filer med 19 tillägg och 15 borttagningar
  1. 14 11
      website_clickjobs/type-1/_clickjob0113.py
  2. 5 4
      website_clickjobs/type-1/hhh_r3.py

+ 14 - 11
website_clickjobs/type-1/_clickjob0113.py

@@ -38,7 +38,7 @@ def empty_query(q):
     driver.get(googleurl)
     time.sleep(3)
 
-def process_query(domain, target_id, brands, query):
+def process_query(domain, target_id, url, query):
     print(target_id)
     sleepoffset = 0
     global driver
@@ -78,16 +78,19 @@ def process_query(domain, target_id, brands, query):
     elmts=driver.find_elements("xpath","//div[@class='yuRUbf']/a")
 
     targets=[]
-    targets.append("https://hhh.com.tw/HHH_NEW/designers/index_designerList.php?cid=" + target_id)
-    targets.append("https://m.hhh.com.tw/HHH_NEW/designers/index_designerList.php?cid=" + target_id)
-    targets.append("https://hhh.com.tw/HHH_NEW/designers_static/" + target_id + ".html")
-    targets.append("https://m.hhh.com.tw/HHH_NEW/designers_static/" + target_id + ".html")
-    targets.append("https://hhh.com.tw/designer-about.php?designer_id=" + target_id + ".html")
-    targets.append("https://m.hhh.com.tw/designer-about.php?designer_id=" + target_id + ".html")
-    
-    targets.append("https://www.hhh.com.tw/HHH_NEW/designers/index_designerList.php?cid=" + target_id)
-    targets.append("https://www.hhh.com.tw/HHH_NEW/designers_static/" + target_id + ".html")
-    targets.append("https://www.hhh.com.tw/designer-about.php?designer_id=" + target_id + ".html")
+    if int(target_id) > 50000: #50XXX = FAQ
+        targets.append(url)
+    else:
+        targets.append("https://hhh.com.tw/HHH_NEW/designers/index_designerList.php?cid=" + target_id)
+        targets.append("https://m.hhh.com.tw/HHH_NEW/designers/index_designerList.php?cid=" + target_id)
+        targets.append("https://hhh.com.tw/HHH_NEW/designers_static/" + target_id + ".html")
+        targets.append("https://m.hhh.com.tw/HHH_NEW/designers_static/" + target_id + ".html")
+        targets.append("https://hhh.com.tw/designer-about.php?designer_id=" + target_id + ".html")
+        targets.append("https://m.hhh.com.tw/designer-about.php?designer_id=" + target_id + ".html")
+        
+        targets.append("https://www.hhh.com.tw/HHH_NEW/designers/index_designerList.php?cid=" + target_id)
+        targets.append("https://www.hhh.com.tw/HHH_NEW/designers_static/" + target_id + ".html")
+        targets.append("https://www.hhh.com.tw/designer-about.php?designer_id=" + target_id + ".html")
     
     print (len(elmts))
     # driver.save_screenshot('c:/tmp/test.png')

+ 5 - 4
website_clickjobs/type-1/hhh_r3.py

@@ -5,17 +5,17 @@ hhhdb = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf
 def reset():
     ds=hhhdb.query('select * from hhh.hhh_designer_for_crawl')
     #TAG ABOVE AND UNTAG BELOW FOR DEBUG
-    #ds=hhhdb.query('select * from hhh.hhh_designer_for_crawl where hdesigner_id=XXX')
+    #ds=hhhdb.query('select * from hhh.hhh_designer_for_crawl where hdesigner_id=')
     data=[]
     for i in ds:
         for j in range(i['weight']):
-            data.append([i['hdesigner_id'],i['title'].replace('\xa0',' ')])
+            data.append([i['hdesigner_id'],i['title'].replace('\xa0',' '),i['url']])
     random.shuffle(data)
     return data
 
 domain = 'hhh.com.tw'
 #target_domain = get_xml_data(sourceurls)
-brands={domain:'hhh'}
+brands={domain:'hhh'} # UNUSED
 #query_list = ['幸福空間']
 
 #data = getdata()
@@ -36,9 +36,10 @@ while True:
             target = random.choice(data) # START HERE
             target_id = str(target[0])
             target_name = target[1]
+            target_url = target[2]
             print(target_id)
             print(target_name)
-            statuscode = execute(domain, target_id, brands, target_name)
+            statuscode = execute(domain, target_id, target_url, target_name)
             if statuscode == 444:
                 print("Completed ", runcount, " times before being caught")
                 time.sleep(300)