"""Fetch pages for URLs queued in the SEO database and store their cleaned text.

URLs present in ``kw_url_search_result`` but missing from ``url_content`` are
opened in a Chrome instance routed through a local SOCKS5 proxy, converted to
plain text with html2text, trimmed by ``clean_txt`` and inserted into
``url_content``.
"""
import codecs
import datetime
import os
import random
import subprocess
import sys
import time
import traceback
import urllib.parse

import dataset
import docker
import html2text
import requests
from bs4 import BeautifulSoup
from fp.fp import FreeProxy
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from userAgentRandomizer import userAgents

# NOTE(review): credentials are hard-coded in source; consider moving them to
# configuration outside the repository.
DB_URL = 'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4'

# URLs that have a search result but no stored content yet, in random order.
PENDING_URLS_SQL = (
    'select url from kw_url_search_result where url not in '
    '(select url from url_content) order by rand()'
)


def _quit_browser(driver):
    """Close *driver* on a best-effort basis; failures are logged, not raised."""
    if driver is None:
        return
    try:
        driver.quit()
    except Exception:
        # The session may already be dead; cleanup must not stop the crawl.
        traceback.print_exc()


def restart_browser(driver=None):
    """Restart the proxy container and return a fresh Chrome driver.

    Args:
        driver: Optional previous driver. When given it is quit first so
            the old Chrome process is not left running.

    Returns:
        A new ``webdriver.Chrome`` using the local SOCKS5 proxy, a random
        user agent, incognito mode, a 1400x1000 window and no cookies.
    """
    _quit_browser(driver)

    # Argument list (no shell); a non-zero exit status is ignored, exactly as
    # it was with os.system.
    subprocess.run(['docker', 'container', 'restart', 'proxy1'], check=False)
    ua = userAgents()
    user_agent = ua.random()
    # Presumably gives the proxy container time to come back up - confirm.
    time.sleep(8)

    options = webdriver.ChromeOptions()
    options.add_argument('--proxy-server=socks5://127.0.0.1:9050')
    options.add_argument("--user-agent=" + user_agent)
    options.add_argument("--incognito")

    new_driver = webdriver.Chrome(options=options)
    new_driver.set_window_size(1400, 1000)
    new_driver.delete_all_cookies()
    return new_driver


def clean_txt(txt):
    """Extract the post body from html2text output of a forum page.

    Lines are collected only after a line containing ' * __ 訂閱文章' has been
    seen (that marker line itself is not kept). Scanning stops at the first
    line containing one of the footer markers; because the footer check runs
    after the append, the footer line is kept when collection has started.
    Lines that are exactly '* __' or '我要回覆' once stripped are skipped.

    Args:
        txt: Plain text as produced by html2text.

    Returns:
        The collected lines, each terminated by a newline. The result is
        also printed.
    """
    footer_markers = (
        '__ 連結 __ 回報 __ 只看樓主 __ 列印',
        '__ 連結 __ 回報 __ 只看此人 __ 列印',
    )
    kept = []
    beginning = False
    for line in txt.split("\n"):
        stripped = line.strip()
        # The original compared the stripped line with ' * __' (leading
        # space), which can never match; compare with the stripped form.
        if stripped == '* __':
            continue
        if stripped == '我要回覆':
            continue
        if beginning:
            kept.append(line + "\n")
        elif ' * __ 訂閱文章' in line:
            beginning = True
        if any(marker in line for marker in footer_markers):
            break
    fulltxt = "".join(kept)
    print(fulltxt)
    return fulltxt


def main():
    """Crawl every pending URL and insert its cleaned text into url_content."""
    # Two connections: one iterates the pending-URL result set, the other
    # performs the inserts (presumably to keep them independent - confirm).
    db = dataset.connect(DB_URL)
    db2 = dataset.connect(DB_URL)
    cursor = db.query(PENDING_URLS_SQL)
    table = db2['url_content']

    driver = restart_browser()
    try:
        for row in cursor:
            url = row['url']
            print(url)
            try:
                driver.get(url)
                time.sleep(5)
                title = driver.title
                blocked = 'Please Wait' in title and 'Cloudflare' in title
                src = None if blocked else driver.page_source
            except WebDriverException:
                # A failing page must not abort the run: log it, start a
                # fresh browser and move on to the next URL.
                traceback.print_exc()
                driver = restart_browser(driver)
                continue

            if blocked:
                # Cloudflare interstitial: restart the proxy and browser,
                # then skip this URL.
                driver = restart_browser(driver)
                continue

            converter = html2text.HTML2Text()
            converter.ignore_links = True
            resulttxt = clean_txt(converter.handle(src))
            table.insert({'content': resulttxt, 'url': url})
            time.sleep(5)
    finally:
        _quit_browser(driver)


if __name__ == "__main__":
    main()