Jason hai 1 ano
pai
achega
767a0b0fa0
Modificáronse 1 ficheiros con 236 adicións e 0 borrados
  1. 236 0
      website_clickjobs/gen_seo2.py

+ 236 - 0
website_clickjobs/gen_seo2.py

@@ -0,0 +1,236 @@
+#import redis
+import time
+import traceback
+#import json
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import time
+import os
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+import dataset
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.chrome.service import Service
+import json
+import random
+import time
+import datetime
+import sys
+import codecs
+import random
+import os
+import time
+import requests
+import pymysql
+import urllib.parse
+# Register PyMySQL under the name MySQLdb so code that imports MySQLdb gets PyMySQL
+# (presumably what the `mysql://` URL below relies on -- confirm against the installed driver).
+pymysql.install_as_MySQLdb()
+# Shared Selenium driver handle; rebound by run_once() and re_get_webdriver().
+driver=None
+
+# Connection URL and LINE token can be overridden from the environment so that
+# secrets do not have to live in the repository. The literals are kept only as
+# backward-compatible defaults.
+# NOTE(review): these defaults are real-looking credentials committed to source;
+# rotate them and drop the fallbacks once SEO_DB_URL / LINE_NOTIFY_TOKEN are deployed.
+db = dataset.connect(os.environ.get('SEO_DB_URL', 'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4'))
+
+# HTTP headers sent with every LINE Notify request made by send_msg().
+headers = {
+        "Authorization": "Bearer " + os.environ.get('LINE_NOTIFY_TOKEN', "6SDULL1Ebklduc6TFxa97AFto5Sj21kyJ30CxiLiSoi"),
+        "Content-Type": "application/x-www-form-urlencoded"
+}
+
+def send_msg(kw):
+    """Post `kw` as the `message` of a LINE Notify request.
+
+    Uses the module-level `headers` for authentication. Delivery is
+    best-effort: network errors and non-200 responses are reported on stdout
+    but never raised, so a failed notification cannot derail the caller.
+
+    Returns None.
+    """
+    params = {"message":kw}
+    try:
+        # Without a timeout requests waits indefinitely on a stalled connection.
+        r = requests.post("https://notify-api.line.me/api/notify",headers=headers, params=params, timeout=10)
+    except requests.RequestException:
+        traceback.print_exc()
+        return
+    if r.status_code != 200:
+        print('LINE notify failed', r.status_code, r.text)
+
+# Result URLs that run_once() must not choose when it falls back to clicking a random link.
+blacklist = ['https://www.chinatimes.com/realtimenews/20220613003142-260402']
+
+
+def re_get_webdriver():
+    """Shut down the shared Chrome driver, if any, and start a fresh headless one.
+
+    The new instance is stored in the module-level `driver`. The function
+    always returns None; on start-up failure the traceback is printed and
+    `driver` is left as None.
+    """
+    global driver
+    if driver is not None:
+        print('closing....')
+        try:
+            driver.quit()
+        except Exception:
+            # A dead session must not prevent the browser from being recycled.
+            traceback.print_exc()
+        # NOTE: `killall chrome` terminates every process named "chrome" on the host,
+        # not only the one this script started.
+        os.system('killall chrome')
+        print('quit....')
+        driver=None
+
+    new_driver=None
+    try:
+        options = webdriver.ChromeOptions()
+        options.add_argument('--headless')
+        options.add_argument("--incognito")
+        new_driver = webdriver.Chrome(options=options)
+        new_driver.delete_all_cookies()
+        new_driver.set_window_size(1400,1000)
+    except Exception:
+        traceback.print_exc()
+        if new_driver is not None:
+            # Chrome started but set-up failed: close it instead of leaking the process.
+            try:
+                new_driver.quit()
+            except Exception:
+                traceback.print_exc()
+        driver=None
+        return None
+    driver=new_driver
+
+def getDriver():
+    """Launch a headless, incognito, no-sandbox Chrome sized 1400x1000 and return it."""
+    chrome_opts = webdriver.ChromeOptions()
+    for flag in ('--headless', '--incognito', '--no-sandbox'):
+        chrome_opts.add_argument(flag)
+    browser = webdriver.Chrome(options=chrome_opts)
+    browser.set_window_size(1400, 1000)
+    return browser
+
+
+def _scroll_through_page(drv, steps=4, pause=3):
+    """Scroll the current page from top to bottom in `steps` increments, sleeping `pause` seconds after each."""
+    page_height = drv.execute_script("return document.body.scrollHeight")
+    # A page shorter than `steps` pixels would give a zero step and loop forever.
+    scroll_step = max(1, page_height // steps)
+    current_height = 0
+    while current_height < page_height:
+        drv.execute_script(f"window.scrollTo(0, {current_height + scroll_step});")
+        time.sleep(pause)
+        current_height += scroll_step
+
+
+def _click_random_allowed(drv, elmts):
+    """Click a random element of `elmts` whose href is not in `blacklist`; return True if one was clicked."""
+    allowed = [e for e in elmts if e.get_attribute('href') not in blacklist]
+    if not allowed:
+        print("Action Terminated")
+        return False
+    pick = random.choice(allowed)
+    webdriver.ActionChains(drv).move_to_element(pick).perform()
+    webdriver.ActionChains(drv).move_to_element(pick).click().perform()
+    return True
+
+
+def run_once(jsobj):
+    """Run one Google search for jsobj['kw'] and click one of the results.
+
+    Keys read from `jsobj`:
+      kw      -- the search phrase.
+      domain  -- optional. When present, the first result whose href contains
+                 it is clicked, logged to the `nda_log` table and scrolled
+                 through. If no result matches, a random non-blacklisted
+                 result is clicked instead.
+      exclude -- required when `domain` is absent; an iterable of substrings.
+                 A random result whose href contains none of them is clicked,
+                 skipping the first five eligible results.
+      cust    -- stored as `client` in the log row (domain mode only).
+
+    Only results whose link text is longer than 10 characters are considered.
+
+    Returns None. Ordinary exceptions are printed and swallowed. When the
+    search yields no results a notification is sent and SystemExit is raised;
+    it propagates to the caller. The browser is closed on every exit path.
+    """
+    global driver
+
+    table=db['nda_log']
+    print(jsobj)
+    # Stored in the module global, as the rest of the script expects.
+    driver=getDriver()
+    try:
+        kw=jsobj['kw']
+        domain=jsobj.get('domain')
+        exclude=jsobj['exclude'] if domain is None else None
+
+        googleurl = 'https://www.google.com/search?q={}&num={}&hl={}'.format(urllib.parse.quote(kw), 100, 'zh-TW')
+        driver.get(googleurl)
+
+        time.sleep(6)
+        print(driver.current_url)
+        # Presumably Google's rate-limit / CAPTCHA interstitial; give up on this run.
+        if 'sorry' in driver.current_url:
+            print("URL Error: Caught")
+            return
+
+        elmts = driver.find_elements(By.XPATH, "//div[@class='yuRUbf']//a")
+
+        numresults=len(elmts)
+        print('搜尋結果數量',numresults)
+        if numresults==0:
+            send_msg('stop working...')
+            sys.exit()
+
+        found=False
+        eligible=[]  # (element, link text) pairs that passed the exclude filter
+
+        # idx is the 1-based position in the result list and is logged as the ranking.
+        for idx, elmt in enumerate(elmts, start=1):
+            href=elmt.get_attribute('href') or ''
+            txt=elmt.text
+            if len(txt)<=10:
+                continue
+
+            if domain is None:
+                if not any(ee in href for ee in exclude):
+                    eligible.append((elmt, txt))
+                continue
+
+            if domain not in href:
+                continue
+
+            print('found....')
+            print('clicked....')
+            print(href)
+            print("ranking", idx)
+            found=True
+
+            webdriver.ActionChains(driver).move_to_element(elmt).perform()
+            webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+            table.insert({'ranking':idx,'kw':kw,'results':numresults,'url':href,'title':txt,'dt':datetime.datetime.now(),'client':jsobj['cust']})
+            time.sleep(5)
+            _scroll_through_page(driver)
+            time.sleep(10)
+            break
+
+        if exclude is not None:
+            print('exclude')
+            pool=eligible[5:]
+            if pool:
+                elmt, txt = random.choice(pool)
+                print(elmt)
+                print(txt)
+
+                webdriver.ActionChains(driver).move_to_element(elmt).perform()
+                elmt.click()
+                time.sleep(5)
+            else:
+                print('exclude: fewer than six eligible results, nothing clicked')
+        elif not found:
+            # Target domain is not ranked: still click something so the search is not wasted.
+            _click_random_allowed(driver, elmts)
+
+    except Exception:
+        # `except Exception` lets SystemExit / KeyboardInterrupt through on purpose.
+        print('exception')
+        traceback.print_exc()
+    finally:
+        driver.quit()
+
+# Main loop: pick one random matching job row, build a keyword that contains the
+# target name, run a single search/click cycle, then wait before the next one.
+while True:
+    try:
+        cursor=db.query('select json from seo.seo_jobs where cust="啟翔" and plan="形象SEO" and json like "%陳百欽%" and (json like "%chinabiz.org.tw%" or json like "%vocus.cc%" or json like "%tw.news.yahoo.com%" or json like "%facebook.com%" or json like "%gvm.com.tw%" or json like "%fingermedia.tw%" or json like "%bg3.co%" or json like "%morningtaiwan.org%" or json like "%pchome.com.tw%" or json like "%twfile.com%" or json like "%twincn.com%" or json like "%theicons.net%" or json like "%nhu.edu.tw%") order by rand() limit 1')
+        js=None
+        for c in cursor:
+            js=json.loads(c['json'])
+        if js is None:
+            # No row matched: do not reuse the previous job's values or hit a NameError.
+            print("No matching job found")
+            time.sleep(20)
+            continue
+
+        prefix=js['prefix']
+        domain=js['domain'][0]
+        positive=js['positive']
+        rnd=js['rnd']
+
+        # Draw keyword combinations until one contains the target name. The number of
+        # attempts is capped so a job that can never produce it does not spin forever.
+        kw=''
+        for _ in range(1000):
+            kw=random.choice(positive)+" "+prefix+" "+random.choice(rnd)
+            if '陳百欽' in kw:
+                break
+        else:
+            print("No keyword containing the target name could be built")
+            time.sleep(20)
+            continue
+
+        run_once({'domain':domain,'kw':kw, 'cust':'啟翔'})
+        time.sleep(61)
+        cursor=None
+        driver=None
+    except Exception:
+        # `except Exception` keeps Ctrl-C and sys.exit() able to stop the script.
+        traceback.print_exc()
+        print("Execution Error")
+        time.sleep(20)