# choozmo / kw_tools
							import traceback
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import os
import datetime
import urllib.parse
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import codecs
import random
from bs4 import BeautifulSoup
import requests
import time
# import rpyc
import sys
import docker
# import googlesearch
import codecs
import sys
import time
import dataset
import os
import pymysql
pymysql.install_as_MySQLdb()

def process_one(driver):
    """Collect title/URL pairs from the result anchors on the current page.

    Every ``<a>`` directly under a ``<div class="yuRUbf">`` is read
    (presumably the organic-result container of a Google results page --
    TODO confirm against the caller).

    Args:
        driver: Selenium WebDriver whose current page is inspected.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts in page order. The
        title is the first line of the anchor's visible text. Anchors that
        raise while being read are reported and skipped.
    """
    results = []
    anchors = driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
    for anchor in anchors:
        try:
            href = anchor.get_attribute('href')
            # The anchor text may span several lines; only the first is kept.
            title = anchor.text.split('\n')[0]
            print(title)
            results.append({'title': title, 'url': href})
        except Exception:
            # Best effort: one unreadable anchor must not abort the page.
            # ``Exception`` (not a bare ``except``) lets Ctrl-C and
            # SystemExit still stop the script.
            print('href2 exception')
            traceback.print_exc()
    return results

def process_query(driver, url, wait=4):
    """Open *url* and return the address of the first ``mailto`` link on it.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the page to open.
        wait: Seconds to sleep after navigation before searching the page.
            Defaults to 4, the delay that used to be hard-coded.

    Returns:
        The link's ``href`` with every ``mailto:`` occurrence removed, or
        ``None`` when no matching link exists, when the link is a
        share-by-mail link (``mailto:?subject=...``) that carries no
        address, or when loading/reading the page fails.
    """
    try:
        driver.get(url)
        time.sleep(wait)
        link = driver.find_element_by_xpath("//a[contains(@href,'mailto')]")
        print(link.text)
        href = link.get_attribute('href')
        print(href)
        # The share-link test must run on the raw href: once 'mailto:' has
        # been stripped the marker can no longer match, and '?subject=...'
        # would be returned as if it were an e-mail address.
        if 'mailto:?subject=' in href:
            return None
        return href.replace('mailto:', '')
    except Exception:
        # Covers a missing link, a missing href and navigation errors, while
        # still letting KeyboardInterrupt/SystemExit stop the script.
        print('not found')
        return None
#    time.sleep(9999)
#        try:
#            elmt=driver.find_element_by_xpath("//a[@id='pnnext']")
#        except:
#            traceback.print_exc()
#            print('pnnext exception')
#            break
#        time.sleep(1.5)
#    return totallst


# Module-level state used by the script further down.
result: list = []  # not filled anywhere in the visible code
driver = None  # placeholder; rebound to the WebDriver returned by restart_browser()
# Location of the chromedriver binary handed to webdriver.Chrome.
path: str = '/Users/zooeytsai/Downloads/chromedriver'

def restart_browser():
    """Launch a new headless Chrome session and return its WebDriver.

    Chrome is started through the chromedriver binary at the module-level
    ``path`` with the command-line flags listed below, and its window is
    then resized to 1400x1000.

    Returns:
        The freshly created ``webdriver.Chrome`` instance.
    """
    # Earlier variants (left commented out in the original) restarted a
    # docker container and connected through webdriver.Remote to
    # http://127.0.0.1:4444/wd/hub instead of a local chromedriver.
    chrome_flags = (
        "--headless",
        "start-maximized",
        'user-data-dir=/Users/zooeytsai/Library/Application Support/Google/Chrome/Default',
        '--profile-directory=Default',
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    browser = webdriver.Chrome(options=chrome_options, executable_path=path)
    browser.set_window_size(1400, 1000)
    return browser

# Script body: for every stored search hit tagged "區塊鏈", open its URL,
# look for a contact e-mail and record the outcome in ``term_progress``.

# NOTE(review): the database credentials are hard-coded in this URL; consider
# loading them from the environment or an untracked config file.
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
# Alternative query that skips URLs already present in term_progress:
# cursor=db.query('select title,url,tag from term_gsearch where url not in (select url from term_progress) and tag like "區塊鏈" order by rand()')
cursor = db.query('select title,url,tag from term_gsearch where tag like "區塊鏈" order by rand()')
# Read every row into a list before the slow per-URL loop starts.
lst = list(cursor)

# Boilerplate stripped from page titles. Order matters: the decorated
# "聯絡我們 ..." variants and "股份有限公司" must be removed before their
# shorter substrings "聯絡我們" and "有限公司".
_TITLE_NOISE = (
    '聯絡我們 - ',
    '聯絡我們-',
    '聯絡我們|',
    '聯絡我們 |',
    '聯絡我們:',
    '股份有限公司',
    '有限公司',
    '聯絡我們',
)

table = db['term_progress']
driver = restart_browser()
try:
    for c in lst:
        email = process_query(driver, c['url'])
        print(email)
        for noise in _TITLE_NOISE:
            c['title'] = c['title'].replace(noise, '')

        # A row is stored even when no e-mail was found (email is None).
        table.insert({'title': c['title'], 'url': c['url'], 'email': email, 'tag': c['tag']})
        time.sleep(2)  # pause between sites
finally:
    # Always shut the browser down so no Chrome/chromedriver process is
    # left running when the loop finishes or fails midway.
    driver.quit()