瀏覽代碼

new program

Jason 1 年之前
父節點
當前提交
e88de8431e
共有 1 個文件被更改,包括 301 次插入和 0 次删除
  1. 301 0
      website_clickjobs/gen_seo2b.py

+ 301 - 0
website_clickjobs/gen_seo2b.py

@@ -0,0 +1,301 @@
+#import redis
+import time
+import traceback
+#import json
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import time
+import os
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import dataset
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.chrome.service import Service
+import json
+import random
+import time
+import datetime
+import sys
+import codecs
+import random
+import os
+import time
+import requests
+import pymysql
+import urllib.parse
+import multiprocessing
+pymysql.install_as_MySQLdb()
+from userAgentRandomizer import userAgents
+driver=None
+driverclosed = 0
+
+
+headers = {
+        "Authorization": "Bearer " + "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi",
+        "Content-Type": "application/x-www-form-urlencoded"
+}
+
+def send_msg(kw):
+    params = {"message":kw}  
+    r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params)
+
+blacklist = ['https://www.chinatimes.com/realtimenews/20220613003142-260402','https://ipo168.pixnet.net/blog/post/207626239-%E5%95%9F%E7%BF%94%E8%BC%95%E9%87%91%E5%B1%AC%E7%A7%91%E6%8A%80%E8%82%A1%E7%A5%A8%E6%98%AF%E9%80%99%E6%A8%A3%E7%9A%84%E5%85%AC%E5%8F%B8%21%21%E6%8A%95%E8%B3%87%E5%89%8D%E8%A6%81','https://latest.mediatagtw.com/article/%e5%95%9f%e7%bf%94%e8%bc%95%e9%87%91%e5%b1%ac%e7%a7%91%e6%8a%80%e8%82%a1%e4%bb%bd%e6%9c%89%e9%99%90%e5%85%ac%e5%8f%b8#gsc.tab=0']
+
+
+def re_get_webdriver():
+    global port
+    global driver
+    result=[]
+    if driver is not None:
+        print('closing....')
+        driver.quit()
+        os.system('killall chrome')
+        print('quit....')
+        driver=None
+    try:
+        options = webdriver.ChromeOptions()
+        # options.add_argument("user-agent=%s" % user_agent)
+        options.add_argument('--headless')
+        options.add_argument("--incognito")
+        driver = webdriver.Chrome(options=options)
+        driver.delete_all_cookies()
+        driver.set_window_size(1400,1000)
+    except:
+        traceback.print_exc()
+        driver=None
+        return None
+
+def getDriver():
+    ua=userAgents().random()
+    options = webdriver.ChromeOptions()
+    #print(ua)
+    #options.add_argument("user-agent="+ua)
+    options.add_argument('--headless')
+    options.add_argument('--incognito')
+    options.add_argument('--no-sandbox')
+    driver=webdriver.Chrome(options=options)
+    driver.set_window_size(1400,1000)
+    return driver
+
+
+def run_once(jsobj,db):
+
+    table=db['nda_log']
+    print(jsobj)
+    global driver
+
+    driverclosed=0
+
+#    i=random.randint(0,9)
+    i=100
+    driver=getDriver()
+    try:
+        kw=jsobj['kw']
+        if jsobj.get('domain') is None:
+            exclude=jsobj['exclude']
+            domain=None
+        else:
+            domain=jsobj['domain']
+            exclude=None
+
+        googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(urllib.parse.quote(kw), 100, 'zh-TW')
+        driver.get(googleurl)
+
+        time.sleep(6)
+        print(driver.current_url)
+        if 'sorry' in driver.current_url:
+            print("URL Error: Caught")
+            driver.quit()
+            driverclosed=1
+            return
+        # elmt = driver.find_element(By.XPATH, "//input[@name='q']")
+        # time.sleep(1)
+        # elmt.send_keys(kw)
+        # elmt.send_keys(Keys.ENTER)
+        # time.sleep(6)
+
+        elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
+
+        numresults=len(elmts)
+        print('搜尋結果數量',numresults)
+        if numresults==0:
+            driver.quit()
+            driverclosed=1
+            return
+
+        idx=1
+        found=False
+        test_lst=[]
+        txt_dict={}
+
+
+        for elmt in elmts:
+            href=elmt.get_attribute('href')
+            txt=elmt.text
+            if len(txt)>10:
+                if domain is not None:
+                    if domain in href and href not in blacklist:
+                        print('found....')
+                        print('clicked....')
+                        print(href)
+                        print("ranking", idx)
+                        found=True
+
+                        webdriver.ActionChains(driver).move_to_element(elmt).perform()
+                        # elmt.click()
+                        webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+                        table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt,'dt':datetime.datetime.now(),'client':jsobj['cust']})
+            
+                        time.sleep(5)
+                        page_height = driver.execute_script("return document.body.scrollHeight")
+
+                        scroll_step = page_height // 4
+                        current_height = 0
+
+                        while current_height < page_height:
+                            driver.execute_script(f"window.scrollTo(0, {current_height + scroll_step});")
+                            time.sleep(3)
+                            current_height += scroll_step
+
+                        time.sleep(10)
+                        break
+                else:
+                    ex=False
+                    for ee in exclude:
+                        if ee in href:
+                            ex=True
+                    if not ex:
+                        test_lst.append(elmt)
+                        txt_dict[elmt]=txt
+                    
+            idx+=1
+
+        if exclude is not None:
+            print('exclude')
+            elmt=random.choice(test_lst[5:])
+            print(elmt)
+            print(txt_dict[elmt])
+
+            webdriver.ActionChains(driver).move_to_element(elmt).perform()
+            elmt.click()
+#            webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+            time.sleep(5)
+
+        if not found: #don't waste resources, pick a random link as long as it is ok
+            attempt=0
+            pick=''
+        '''
+            negativeflag=True
+            while negativeflag==True:
+                attempt+=1
+                negativeflag=False
+                pick = random.choice(elmts)
+                href = pick.get_attribute('href')
+                if href in blacklist:
+                    negativeflag=True
+                ''''''try:
+                    content = pick.find_element(By.XPATH, "//em[@class='VwiC3b yXK7lf lyLwlc yDYNvb W8l4ac lEBKkf']/").text
+                    print(content)
+                    if "陳百欽" not in content:
+                        Exception
+                except:
+                    print("Not Found")
+                    negativeFlag = True''''''
+                if attempt==100:
+                    print("Action Terminated")
+                    break
+            print(href)
+            webdriver.ActionChains(driver).move_to_element(pick).perform()
+            webdriver.ActionChains(driver).move_to_element(pick).click().perform()
+            time.sleep(10)
+            #table.insert({'ranking':-1,'kw':kw,'results':numresults,'url':'','title':'未收錄','client':jsobj['cust']})
+        '''
+
+
+    except:
+        print('exception')
+        traceback.print_exc()
+
+    driver.quit()
+    driverclosed=1
+    # sys.exit()
+
+def exe():
+    try: # OLD TABLE NAME: seo.seo_jobs
+        db = dataset.connect('postgresql://postgres:eyJhbGciOiJI@172.105.241.163:5432/postgres')
+        '''cursor=db.query("select json from public.seo_jobs_temp where cust='啟翔' and plan='形象SEO' and json like '%陳百欽%' and (json like '%chinatimes.com%') order by random() limit 1")
+        for c in cursor:
+            js=json.loads(c['json'])
+            prefix=js['prefix']
+            postfix=js['postfix']
+            domain=js['domain'][0]
+            positive=js['positive']
+            rnd=js['rnd']
+
+        kw=''
+        while '陳百欽' not in kw:
+            kw=''
+            kw1=random.choice(positive)
+            kw2=random.choice(rnd)
+            kw=prefix+" "+kw2+" "+kw1
+            code='03'
+'''
+        kwlist = ['創新園區','產業聚落','桃園新屋','研發中心','航太工業','節能減碳','中小企業','高端市場','經營哲學','防疫門','Bellavita','IKEA','馬達','家具','歐美','家具家飾','歐美日','台北101','鋁材帷幕牆','金屬合金','鋁產品製程','台灣鋁業市占率','外銷市場','緬甸設廠','產能需求','EMBA','汽車應用','高端價值','W Hotel','北歐家具','日本家具品牌','醫療產業','循環經濟','歐美日訂單','藍海策略','重圍突破','航太產業','LED','綠色','台灣人才','國際市場競爭力','東協','產業升級','光電屋頂','優勢國際綠能公司','台北小巨蛋','大安森林公園','太陽能發電站','綠能環保科技園區','發電設備']
+
+        domain='chinatimes.com'
+        kw=random.choice(kwlist) + ' 陳百欽'
+
+        run_once({'domain':domain,'kw':kw, 'cust':'啟翔'},db)
+        try:
+            if driver is not None:
+                try:
+                    driver.quit()
+                except:
+                    pass
+        except:
+            pass
+        cursor=None
+        driver=None
+        db.close()
+        print("Completed")
+        time.sleep(61)
+    except:
+        traceback.print_exc()
+        print("Execution Error")
+        try:
+            if driver is not None:
+                try:
+                    driver.quit()
+                except:
+                    pass
+        except:
+            pass
+        cursor=None
+        driver=None
+        db.close()
+        time.sleep(20)
+
+def cleanup():
+    if driverclosed == 0:
+        try:
+            driver.quit()
+        except:
+            pass
+
+if __name__ == '__main__':
+    runcount=1
+    while True:
+        print("Run "+ str(runcount))
+        start_time = time.time()
+        p = multiprocessing.Process(target=exe)
+        p.start()
+        p.join(120)
+        if p.is_alive():
+            print("Overtime")
+            p.kill()
+            cleanup()
+        p.join()
+        duration = time.time()-start_time
+        print("Runs: " + str(runcount) + " | Duration: " + str(duration))
+        runcount+=1