term_get_email.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. import traceback
  2. from selenium import webdriver
  3. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  4. import time
  5. import os
  6. import datetime
  7. import urllib.parse
  8. from selenium.webdriver.support.ui import WebDriverWait
  9. from selenium.webdriver.common.by import By
  10. from selenium.webdriver.support import expected_conditions as EC
  11. import codecs
  12. import random
  13. from bs4 import BeautifulSoup
  14. import requests
  15. import time
  16. import rpyc
  17. import sys
  18. import docker
  19. import googlesearch
  20. import codecs
  21. import sys
  22. import time
  23. import dataset
  24. import os
  def process_one(driver):
      """Collect the title and URL of every result link on the current page.

      Reads each ``<a>`` that is a direct child of a ``<div class="yuRUbf">``
      (presumably a search-result container -- confirm against the page being
      scraped) and records its ``href`` together with the first line of its
      visible text.

      Args:
          driver: Selenium WebDriver (or any object exposing
              ``find_elements(by, value)``) already showing the page.

      Returns:
          list[dict]: One ``{'title': ..., 'url': ...}`` entry per link that
          could be read. A link that raises while being read is reported on
          stdout (with its traceback) and skipped.
      """
      results = []
      # "xpath" is the value of selenium's By.XPATH. find_elements(by, value)
      # exists in Selenium 3 and 4, whereas find_elements_by_xpath was removed
      # in Selenium 4.
      anchors = driver.find_elements("xpath", "//div[@class='yuRUbf']/a")
      for anchor in anchors:
          try:
              href = anchor.get_attribute('href')
              # Only the first line of the link text is used as the title.
              title = anchor.text.split('\n')[0]
          except Exception:
              # Best effort: one unreadable element must not abort the page,
              # but Ctrl-C / SystemExit are no longer swallowed.
              print('href2 exception')
              traceback.print_exc()
              continue
          print(title)
          results.append({'title': title, 'url': href})
      return results
  def process_query(driver, url, wait=4):
      """Load *url* and return the address of the first ``mailto:`` link on it.

      Args:
          driver: Selenium WebDriver (or any object exposing ``get(url)`` and
              ``find_element(by, value)``).
          url: Address of the page to open.
          wait: Seconds to sleep after ``driver.get`` before looking for the
              link. Defaults to 4, the delay that used to be hard-coded.

      Returns:
          The link's ``href`` with every ``mailto:`` removed, or ``None`` when
          loading the page or locating/reading the link fails, or when the
          link is a recipient-less share link (``mailto:?subject=...``).
      """
      try:
          driver.get(url)
          time.sleep(wait)
          # "xpath" is the value of selenium's By.XPATH. find_element(by, value)
          # exists in Selenium 3 and 4, whereas find_element_by_xpath was
          # removed in Selenium 4.
          link = driver.find_element("xpath", "//a[contains(@href,'mailto')]")
          print(link.text)
          href = link.get_attribute('href')
          print(href)
      except Exception:
          # Best effort: a page without a mailto link (or one that fails to
          # load) simply yields no e-mail.
          print('not found')
          return None
      # The share-link test must run BEFORE "mailto:" is stripped; once the
      # prefix is gone the pattern can never match.
      if href is None or 'mailto:?subject=' in href:
          return None
      return href.replace('mailto:', '')
  54. # time.sleep(9999)
  55. # try:
  56. # elmt=driver.find_element_by_xpath("//a[@id='pnnext']")
  57. # except:
  58. # traceback.print_exc()
  59. # print('pnnext exception')
  60. # break
  61. # time.sleep(1.5)
  62. # return totallst
  # Module-level placeholders. `result` starts as an empty list and is not
  # referenced by any other code visible in this file; `driver` is rebound
  # to a live browser session by the script section further down.
  result = []
  driver = None
  def restart_browser():
      """Start a local Chrome session that reuses an existing user profile.

      Chrome is launched maximized with the hard-coded Windows user-data
      directory and its ``Default`` profile, then resized to 1400x1000.

      Returns:
          The newly created ``webdriver.Chrome`` instance.
      """
      # NOTE: earlier revisions (previously kept here as commented-out code)
      # restarted a docker container named p4444 and connected through
      # webdriver.Remote at http://127.0.0.1:4444/wd/hub, or selected the
      # profile directory "Profile 77" instead of "Default".
      options = webdriver.ChromeOptions()
      options.add_argument("start-maximized")
      options.add_argument('user-data-dir=C:\\Users\\jared\\AppData\\Local\\Google\\Chrome\\User Data')
      options.add_argument('--profile-directory=Default')
      # `options=` is the supported keyword; `chrome_options=` is deprecated
      # and no longer accepted by recent Selenium 4 releases.
      browser = webdriver.Chrome(options=options)
      # The explicit size is applied after start-maximized, so it is the
      # last size requested for the window.
      browser.set_window_size(1400, 1000)
      return browser
  # NOTE(review): the connection URL embeds the database user and password in
  # source control; consider loading it from the environment or a config file.
  db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')

  # Rows of term_gsearch whose url is not yet present in term_progress,
  # returned in random order.
  cursor = db.query('select title,url,tag from term_gsearch where url not in (select url from term_progress) order by rand()')

  # Read the whole result set into memory up front (presumably so the query
  # is not left open while rows are inserted later -- confirm).
  lst = list(cursor)

  # Table that receives one row per processed URL.
  table = db['term_progress']

  driver = restart_browser()
  # Boilerplate removed from page titles, applied in exactly this order:
  # the decorated "聯絡我們" variants and the longer company suffix come before
  # their shorter forms so the longer match is removed first.
  _TITLE_NOISE = (
      '聯絡我們 - ',
      '聯絡我們-',
      '聯絡我們|',
      '聯絡我們 |',
      '聯絡我們:',
      '股份有限公司',
      '有限公司',
      '聯絡我們',
  )


  def _clean_title(title):
      """Return *title* with every ``_TITLE_NOISE`` substring removed."""
      for noise in _TITLE_NOISE:
          title = title.replace(noise, '')
      return title


  try:
      for c in lst:
          # `email` is None when no usable mailto link was found; the row is
          # still inserted, which keeps the URL out of later runs because the
          # selection query skips URLs already in term_progress.
          email = process_query(driver, c['url'])
          c['title'] = _clean_title(c['title'])
          table.insert({'title': c['title'], 'url': c['url'], 'email': email, 'tag': c['tag']})
  finally:
      # Always shut the browser down, even when a row fails, so no Chrome or
      # chromedriver process is left behind.
      driver.quit()