"""Collect contact e-mail addresses for pages stored in the `seo` database.

The script reads (title, url, tag) rows from the ``term_gsearch`` table,
opens every URL in headless Chrome, looks for a ``mailto:`` link on the page
and writes one row per URL (cleaned title, url, email, tag) into the
``term_progress`` table.
"""
import codecs
import datetime
import os
import random
import sys
import time
import traceback
import urllib.parse

from bs4 import BeautifulSoup
import dataset
import docker
import pymysql
import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# import rpyc
# import googlesearch

# Let SQLAlchemy's "mysql://" URL use PyMySQL as the MySQLdb driver.
pymysql.install_as_MySQLdb()

# Anchors of Google result entries (used by process_one).
RESULT_LINK_XPATH = "//div[@class='yuRUbf']/a"
# Any anchor whose href mentions "mailto" (used by process_query).
MAILTO_XPATH = "//a[contains(@href,'mailto')]"

# Substrings removed from page titles, in this order.  Order matters: the
# decorated "聯絡我們" ("Contact us") variants must go before the bare one,
# and "股份有限公司" ("Co., Ltd.") before its suffix "有限公司" ("Ltd.").
TITLE_NOISE = (
    '聯絡我們 - ',
    '聯絡我們-',
    '聯絡我們|',
    '聯絡我們 |',
    '聯絡我們:',
    '股份有限公司',
    '有限公司',
    '聯絡我們',
)


def process_one(driver):
    """Return the result links found on the page currently loaded in *driver*.

    Each entry is a dict ``{'title': <first line of the link text>,
    'url': <href>}``.  Links that cannot be read are reported and skipped.
    """
    lst = []
    for elmt in driver.find_elements(By.XPATH, RESULT_LINK_XPATH):
        try:
            href = elmt.get_attribute('href')
            title = elmt.text.split('\n')[0]
        except WebDriverException:
            # Best effort: report the failing link and carry on with the rest.
            print('href2 exception')
            traceback.print_exc()
            continue
        print(title)
        lst.append({'title': title, 'url': href})
    return lst


def _email_from_mailto(href):
    """Return the address part of a ``mailto:`` href, or None if it has none.

    ``mailto:info@example.com?subject=Hi`` -> ``info@example.com``
    ``mailto:?subject=Share%20this``       -> None (share link, no recipient)
    """
    if not href:
        return None
    scheme, sep, rest = href.partition(':')
    if not sep or scheme.strip().lower() != 'mailto':
        return None
    # Everything after '?' is header fields (subject, body, ...), not address.
    address = urllib.parse.unquote(rest.split('?', 1)[0]).strip()
    return address or None


def process_query(driver, url):
    """Open *url* and return the first e-mail address linked via ``mailto:``.

    Returns None when the page cannot be loaded, has no ``mailto:`` link, or
    only has recipient-less share links such as ``mailto:?subject=...``.
    """
    try:
        driver.get(url)
        time.sleep(4)  # give the page time to render its links
        for anchor in driver.find_elements(By.XPATH, MAILTO_XPATH):
            href = anchor.get_attribute('href')
            print(anchor.text)
            print(href)
            # The href must be inspected *before* the scheme is stripped;
            # otherwise a share link can no longer be told from an address.
            email = _email_from_mailto(href)
            if email is not None:
                return email
    except WebDriverException as exc:
        # Best effort: a page that fails to load simply yields no address.
        print('lookup failed:', exc)
    print('not found')
    return None


def _clean_title(title):
    """Strip "Contact us" / company-suffix boilerplate from a page title."""
    if not title:
        return title
    for noise in TITLE_NOISE:
        title = title.replace(noise, '')
    return title


result = []
driver = None
path = '/Users/zooeytsai/Downloads/chromedriver'


def restart_browser():
    """Start a headless Chrome session and return its WebDriver.

    The browser uses the local default Chrome profile and the chromedriver
    binary at the module-level ``path``.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("start-maximized")
    options.add_argument('user-data-dir=/Users/zooeytsai/Library/Application Support/Google/Chrome/Default')
    options.add_argument('--profile-directory=Default')
    # NOTE(review): `executable_path` is deprecated in Selenium 4; switch to
    # a Service object once Selenium 3 no longer needs to be supported.
    # A remote alternative (docker container "p4444") would be
    # webdriver.Remote(command_executor='http://127.0.0.1:4444/wd/hub', ...).
    browser = webdriver.Chrome(options=options, executable_path=path)
    browser.set_window_size(1400, 1000)
    return browser


# NOTE(review): credentials are embedded in the default URL; set SEO_DB_URL
# in the environment to override it and consider removing the default.
db = dataset.connect(os.environ.get(
    'SEO_DB_URL',
    'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4'))

# Variant that skips URLs already present in term_progress:
# select title,url,tag from term_gsearch
#   where url not in (select url from term_progress)
#   and tag like "區塊鏈" order by rand()
cursor = db.query('select title,url,tag from term_gsearch where tag like "區塊鏈" order by rand()')
# Materialise the rows first so the result set is not held open while browsing.
lst = list(cursor)

table = db['term_progress']
driver = restart_browser()
try:
    for c in lst:
        email = process_query(driver, c['url'])
        print(email)
        c['title'] = _clean_title(c['title'])
        table.insert({'title': c['title'], 'url': c['url'], 'email': email, 'tag': c['tag']})
        time.sleep(2)
finally:
    # Always shut Chrome down, even when the loop is interrupted or fails.
    driver.quit()