# choozmo / kw_tools
import traceback
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import os
import datetime
import urllib.parse
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import codecs
import random
from bs4 import BeautifulSoup
import requests
import time
import rpyc
import sys
import docker
import  googlesearch
import codecs
import sys
import time
import dataset
import os

def process_one(driver):
    """Collect title/URL pairs for the result links on the current page.

    Result links are the ``<a>`` elements that are direct children of a
    ``<div class="yuRUbf">`` (presumably the markup of a Google results
    page -- confirm against the caller).

    Args:
        driver: Selenium WebDriver whose current page is inspected.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts in page order. The
        title is the first line of the link's visible text. Links that
        raise while being read are reported and skipped.
    """
    results = []
    # "xpath" is the value of selenium's By.XPATH. find_elements(by, value)
    # exists in Selenium 3 and 4, while find_elements_by_xpath was removed
    # in Selenium 4.3.
    anchors = driver.find_elements("xpath", "//div[@class='yuRUbf']/a")
    for anchor in anchors:
        try:
            href = anchor.get_attribute('href')
            title = anchor.text.split('\n')[0]
            print(title)
            results.append({'title': title, 'url': href})
        except Exception:
            # Best effort: one unreadable link must not abort the whole
            # page, but Ctrl-C / SystemExit still propagate.
            print('href2 exception')
            traceback.print_exc()
    return results

def process_query(driver,url,wait=4):
    """Open *url* and return the address of the first ``mailto:`` link on it.

    Args:
        driver: Selenium WebDriver used to load and inspect the page.
        url: Address of the page to open.
        wait: Seconds to pause after navigation before searching the page.
            Defaults to 4, the delay that used to be hard-coded.

    Returns:
        The link target with the ``mailto:`` scheme removed, or ``None``
        when the page has no mailto link, when the link is a recipient-less
        share link (``mailto:?subject=...``), or when loading/reading the
        page fails for any other reason.
    """
    try:
        driver.get(url)
        time.sleep(wait)
        # "xpath" is the value of selenium's By.XPATH; find_element(by, value)
        # replaces find_element_by_xpath, which was removed in Selenium 4.3.
        link = driver.find_element("xpath", "//a[contains(@href,'mailto')]")
        print(link.text)
        href = link.get_attribute('href')
        print(href)
        # Test the raw href: once 'mailto:' has been stripped this pattern
        # can no longer match, which let share links through as "emails".
        if 'mailto:?subject=' in href:
            return None
        return href.replace('mailto:', '')
    except Exception:
        # Best effort: any lookup/navigation failure counts as "no email",
        # but Ctrl-C / SystemExit still propagate.
        print('not found')
        return None
#    time.sleep(9999)
#        try:
#            elmt=driver.find_element_by_xpath("//a[@id='pnnext']")
#        except:
#            traceback.print_exc()
#            print('pnnext exception')
#            break
#        time.sleep(1.5)
#    return totallst


# Module-level state.
#   result: starts as an empty list; nothing in the visible part of this
#           file reads or appends to it.
#   driver: placeholder until the script body assigns the browser returned
#           by restart_browser().
result, driver = [], None

def restart_browser():
    """Start a local Chrome session that reuses an existing user profile.

    The browser is launched with the ``Default`` profile from a hard-coded
    Chrome user-data directory, so the path is specific to one Windows
    account. An earlier variant that restarted a docker container and
    attached through ``webdriver.Remote`` at 127.0.0.1:4444 is no longer
    used.

    Returns:
        A ``webdriver.Chrome`` instance whose window is sized 1400x1000.
    """
    chrome_opts = webdriver.ChromeOptions()
    chrome_opts.add_argument("start-maximized")
    chrome_opts.add_argument('user-data-dir=C:\\Users\\jared\\AppData\\Local\\Google\\Chrome\\User Data')
    chrome_opts.add_argument('--profile-directory=Default')

    # `options=` is the supported keyword; `chrome_options=` was deprecated
    # and later removed in Selenium 4.
    browser = webdriver.Chrome(options=chrome_opts)
    browser.set_window_size(1400, 1000)
    return browser

# Substrings removed from page titles, applied in this order. The decorated
# "contact us" variants come before the bare one so their separators go too,
# and the longer company suffix comes before the shorter one it contains.
_TITLE_NOISE = (
    '聯絡我們 - ',    # "contact us - "
    '聯絡我們-',
    '聯絡我們|',
    '聯絡我們 |',
    '聯絡我們:',
    '股份有限公司',   # company-type suffix (company limited by shares)
    '有限公司',       # company-type suffix (limited company)
    '聯絡我們',       # bare "contact us"
)


def _clean_title(title):
    """Return *title* with every _TITLE_NOISE substring removed, in order."""
    for noise in _TITLE_NOISE:
        title = title.replace(noise, '')
    return title


# NOTE(review): the database credentials are hard-coded in the connection
# URL; consider loading them from the environment or a config file.
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
# Pages from term_gsearch that have no term_progress row yet, in random order.
cursor=db.query('select title,url,tag from term_gsearch where url not in (select url from term_progress) order by rand()')
# Materialize the rows before the slow per-page browser work starts.
lst=list(cursor)

table=db['term_progress']
driver=restart_browser()
try:
    for c in lst:
        email=process_query(driver,c['url'])
        c['title']=_clean_title(c['title'])
        # One row per visited page; email is None when no address was found.
        table.insert({'title':c['title'],'url':c['url'],'email':email,'tag':c['tag']})
finally:
    # Always shut the browser down so no Chrome/chromedriver process is
    # left behind, whether the run finishes or fails part-way.
    driver.quit()