123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114 |
- import traceback
- from selenium import webdriver
- from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
- import time
- import os
- import datetime
- import urllib.parse
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support import expected_conditions as EC
- import codecs
- import random
- from bs4 import BeautifulSoup
- import requests
- import time
- # import rpyc
- import sys
- import docker
- # import googlesearch
- import codecs
- import sys
- import time
- import dataset
- import os
- import pymysql
- pymysql.install_as_MySQLdb()
def process_one(driver):
    """Collect title and URL of every result link on the driver's current page.

    Anchors matching ``//div[@class='yuRUbf']/a`` are read one by one; the
    first line of each anchor's text is printed and used as its title.

    Args:
        driver: Selenium WebDriver-like object exposing
            ``find_elements_by_xpath``.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts, one per anchor that
        could be read. Anchors that raise while being read are reported and
        skipped.
    """
    # NOTE(review): find_elements_by_xpath was removed in Selenium 4.3;
    # switch to find_elements(By.XPATH, ...) when the Selenium pin is raised.
    anchors = driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")

    results = []
    for anchor in anchors:
        try:
            href = anchor.get_attribute('href')
            # Only the first line of the anchor text is kept as the title.
            title = anchor.text.split('\n')[0]
            print(title)
            results.append({'title': title, 'url': href})
        except Exception:
            # Best effort: one unreadable anchor must not abort the page.
            # Catching Exception (not a bare except) lets Ctrl-C and
            # SystemExit still stop the crawl.
            print('href2 exception')
            traceback.print_exc()
    return results
def process_query(driver, url, wait_seconds=4):
    """Open *url* and return the address of the first ``mailto`` link on it.

    Args:
        driver: Selenium WebDriver-like object exposing ``get`` and
            ``find_element_by_xpath``.
        url: Address of the page to load.
        wait_seconds: Pause after navigation before the page is searched.
            Defaults to 4, the delay this function always used.

    Returns:
        The link's ``href`` with ``mailto:`` removed, or ``None`` when the
        page cannot be loaded, no mailto link is found, or the link is a
        recipient-less share link (``mailto:?subject=...``).
    """
    try:
        driver.get(url)
        time.sleep(wait_seconds)
        # NOTE(review): find_element_by_xpath was removed in Selenium 4.3;
        # switch to find_element(By.XPATH, ...) when the pin is raised.
        link = driver.find_element_by_xpath("//a[contains(@href,'mailto')]")
        print(link.text)
        href = link.get_attribute('href')
        print(href)
        # This must be tested BEFORE 'mailto:' is stripped: once the prefix
        # is gone the pattern can no longer match and "share this page"
        # links would be returned as if they were addresses.
        if 'mailto:?subject=' in href:
            return None
        return href.replace('mailto:', '')
    except Exception:
        # Best effort: any failure (navigation, lookup, missing href) is
        # reported as "no address"; Ctrl-C and SystemExit still propagate.
        print('not found')
        return None
- # time.sleep(9999)
- # try:
- # elmt=driver.find_element_by_xpath("//a[@id='pnnext']")
- # except:
- # traceback.print_exc()
- # print('pnnext exception')
- # break
- # time.sleep(1.5)
- # return totallst
# Module-level state used by the script section of this file.
result = []  # not referenced anywhere in the visible part of this file
driver = None  # placeholder; the script section assigns the live browser later
# Location of the chromedriver binary handed to webdriver.Chrome.
path = '/Users/zooeytsai/Downloads/chromedriver'
def restart_browser():
    """Start a new headless Chrome session and return its driver.

    Chrome is launched through the chromedriver binary at the module-level
    ``path`` with the user-data-dir / profile-directory flags listed below,
    and its window is resized to 1400x1000.

    Returns:
        The newly created ``webdriver.Chrome`` instance.
    """
    # Flags are applied in this order.
    chrome_flags = (
        "--headless",
        "start-maximized",
        'user-data-dir=/Users/zooeytsai/Library/Application Support/Google/Chrome/Default',
        '--profile-directory=Default',
    )
    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    # An earlier variant connected to a remote hub through webdriver.Remote
    # (after restarting a docker container); this version drives a local
    # chromedriver instead.
    browser = webdriver.Chrome(options=chrome_options, executable_path=path)
    browser.set_window_size(1400, 1000)
    return browser
# SECURITY(review): the database user and password are embedded in this URL.
# Move them to configuration / environment and rotate the password.
db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
# Every term_gsearch row carrying the "blockchain" tag, in random order.
# (A previous variant of this query also skipped URLs already present in
# term_progress.)
cursor = db.query('select title,url,tag from term_gsearch where tag like "區塊鏈" order by rand()')
# Read all rows into a list before the crawl starts.
lst = list(cursor)
table = db['term_progress']

# Boilerplate removed from page titles ("contact us" prefixes and company
# suffixes). Order matters: longer variants come before the shorter strings
# they contain.
_TITLE_NOISE = (
    '聯絡我們 - ',
    '聯絡我們-',
    '聯絡我們|',
    '聯絡我們 |',
    '聯絡我們:',
    '股份有限公司',
    '有限公司',
    '聯絡我們',
)

driver = restart_browser()
try:
    for c in lst:
        email = process_query(driver, c['url'])
        print(email)
        for noise in _TITLE_NOISE:
            c['title'] = c['title'].replace(noise, '')
        table.insert({'title': c['title'], 'url': c['url'], 'email': email, 'tag': c['tag']})
        # Pause between pages.
        time.sleep(2)
finally:
    # Always shut the browser down, even when the crawl is interrupted or a
    # row fails; otherwise the headless Chrome process is left running.
    driver.quit()
|