Jared 3 gadi atpakaļ
vecāks
revīzija
5eb6b811ef
2 mainītis faili ar 158 papildinājumiem un 0 dzēšanām
  1. 108 0
      choozmo/term_get_email.py
  2. 50 0
      choozmo/term_gsearch.py

+ 108 - 0
choozmo/term_get_email.py

@@ -0,0 +1,108 @@
+import traceback
+from selenium import webdriver
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+import time
+import os
+import datetime
+import urllib.parse
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+import codecs
+import random
+from bs4 import BeautifulSoup
+import requests
+import time
+import rpyc
+import sys
+import docker
+import  googlesearch
+import codecs
+import sys
+import time
+import dataset
+import os
+
+def process_one(driver):
+    """Collect the title and URL of every result link on the current page.
+
+    Result links are the ``<a>`` elements that sit directly under a
+    ``<div class="yuRUbf">``.
+
+    Args:
+        driver: Selenium WebDriver (any object exposing
+            ``find_elements_by_xpath``) positioned on the page to scan.
+
+    Returns:
+        A list of ``{'title': ..., 'url': ...}`` dicts, one per link that
+        could be read. Links that raise while being read are skipped.
+    """
+    lst = []
+    for elmt in driver.find_elements_by_xpath("//div[@class='yuRUbf']/a"):
+        try:
+            href = elmt.get_attribute('href')
+            # Only the first line of the anchor text is used as the title.
+            title = elmt.text.split('\n')[0]
+            # The print stays inside the try on purpose: if the console
+            # cannot encode the title, the link is skipped, not fatal.
+            print(title)
+            lst.append({'title': title, 'url': href})
+        except Exception:
+            # Best effort per link. `Exception` (not a bare except) so that
+            # Ctrl-C / SystemExit still stop the scrape.
+            print('href2 exception')
+            traceback.print_exc()
+    return lst
+
+def process_query(driver, url, wait_seconds=4):
+    """Open *url* and return the address of the first ``mailto:`` link found.
+
+    Args:
+        driver: Selenium WebDriver (any object exposing ``get`` and
+            ``find_element_by_xpath``).
+        url: Page to load.
+        wait_seconds: Pause after loading the page before it is searched.
+            Defaults to 4, the delay that used to be hard-coded.
+
+    Returns:
+        The link's ``href`` with every ``mailto:`` removed, or ``None`` when
+        the page has no such link, the link is a recipient-less share link
+        (``mailto:?subject=...``), or anything fails while loading/reading.
+    """
+    try:
+        driver.get(url)
+        time.sleep(wait_seconds)
+        elmt = driver.find_element_by_xpath("//a[contains(@href,'mailto')]")
+        print(elmt.text)
+        href = elmt.get_attribute('href')
+        print(href)
+        # Test the raw href: once 'mailto:' has been stripped this pattern
+        # can no longer match, and '?subject=...' would be returned as an
+        # email address.
+        if 'mailto:?subject=' in href:
+            return None
+        return href.replace('mailto:', '')
+    except Exception:
+        # Covers "element not found" as well as navigation errors and a
+        # missing href; the caller treats None as "no email".
+        print('not found')
+        return None
+#    time.sleep(9999)
+#        try:
+#            elmt=driver.find_element_by_xpath("//a[@id='pnnext']")
+#        except:
+#            traceback.print_exc()
+#            print('pnnext exception')
+#            break
+#        time.sleep(1.5)
+#    return totallst
+
+
+# Module-level list; nothing in the visible part of this script reads or fills it.
+result=[]
+# Placeholder for the browser session; rebound to the WebDriver returned by
+# restart_browser() before the main loop starts.
+driver=None
+
+def restart_browser(user_data_dir='C:\\Users\\jared\\AppData\\Local\\Google\\Chrome\\User Data',
+                    profile_directory='Default'):
+    """Start a local Chrome session that reuses an existing user profile.
+
+    Args:
+        user_data_dir: Chrome "User Data" directory to run with. The default
+            is the path that used to be hard-coded here, so existing callers
+            are unaffected; pass another path on other machines.
+        profile_directory: Name of the profile inside ``user_data_dir``.
+
+    Returns:
+        A ``selenium.webdriver.Chrome`` instance with a 1400x1000 window.
+    """
+    options = webdriver.ChromeOptions()
+    options.add_argument("start-maximized")
+    options.add_argument('user-data-dir=' + user_data_dir)
+    options.add_argument('--profile-directory=' + profile_directory)
+
+    # NOTE: an earlier variant (previously kept here as commented-out code)
+    # restarted the docker container "p4444" and connected through
+    # webdriver.Remote at http://127.0.0.1:4444/wd/hub instead of
+    # launching Chrome locally.
+    # NOTE(review): `chrome_options=` is the Selenium 3 keyword; newer
+    # releases expect `options=` - confirm the installed version before
+    # changing it.
+    driver = webdriver.Chrome(chrome_options=options)
+    driver.set_window_size(1400, 1000)
+    return driver
+
+# NOTE(review): database credentials are hard-coded in this URL; they should
+# be loaded from configuration or the environment instead of being committed.
+db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
+# Rows of term_gsearch whose url has no entry in term_progress yet, in
+# random order.
+cursor = db.query('select title,url from term_gsearch where url not in (select url from term_progress) order by rand()')
+# Read every pending row before the crawl starts.
+lst = list(cursor)
+
+table = db['term_progress']
+# Boilerplate removed from page titles, applied in exactly this order:
+# "contact us" prefixes first, then company-type suffixes.
+title_noise = (
+    '聯絡我們 - ',
+    '聯絡我們-',
+    '聯絡我們|',
+    '聯絡我們 |',
+    '聯絡我們:',
+    '股份有限公司',
+    '有限公司',
+)
+driver = restart_browser()
+try:
+    for c in lst:
+        # email is None when no usable mailto link was found; the row is
+        # still recorded so the url is not picked up again by the query above.
+        email = process_query(driver, c['url'])
+        title = c['title']
+        for noise in title_noise:
+            title = title.replace(noise, '')
+        table.insert({'title': title, 'url': c['url'], 'email': email})
+finally:
+    # Always close the browser, even if the loop is interrupted or fails.
+    driver.quit()

+ 50 - 0
choozmo/term_gsearch.py

@@ -0,0 +1,50 @@
+import gsearch_general
+import sys
+import dataset
+import datetime
+
+# NOTE(review): database credentials are hard-coded in this URL; consider
+# loading them from configuration or the environment instead.
+db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
+# Table that receives the search results inserted at the end of this script.
+table=db['term_gsearch']
+
+
+# Browser session used for the search query below and, if the query returns
+# nothing, for dumping the page source.
+driver=gsearch_general.restart_browser()
+#idx=gsearch_general.process_query(driver,'團購 email 牛',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'site:facebook.com pages  inurl:about 其他聯絡資訊 購',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mailto 批發',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mailto 蛋捲',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 肉乾',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 寵物',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 雞精',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 設計',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 民宿',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 面膜',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 杏仁',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 戶外',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 嬰兒',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 燈泡',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 焦糖',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 芝麻',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 網購',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 膠囊',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 實業',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 工作室',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 企業社',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 電商',number_results=100,language_code='zh-TW')
+#idx=gsearch_general.process_query(driver,'聯絡我們 mail 社會企業',number_results=100,language_code='zh-TW')
+# Run the currently active search; earlier queries are kept above as
+# commented-out history.
+idx = gsearch_general.process_query(driver, '聯絡我們 mail 商城', number_results=100, language_code='zh-TW')
+
+print(idx)
+# Check for a missing result BEFORE iterating: looping over None raises
+# TypeError, which previously kept the ban detection below from ever running.
+if idx is None:
+    page_source = driver.page_source
+    print(page_source)
+    # Text of Google's "unusual traffic from your computer network" block page.
+    if '我們的系統偵測到您的電腦網路送出的流量有異常情況' in page_source:
+        print('baned.....')
+        sys.exit()
+else:
+    for x in idx:
+        # Stamp each result with the time it was stored.
+        x['dt'] = datetime.datetime.now()
+        table.insert(x)