term_get_email.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. import traceback
  2. from selenium import webdriver
  3. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  4. import time
  5. import os
  6. import datetime
  7. import urllib.parse
  8. from selenium.webdriver.support.ui import WebDriverWait
  9. from selenium.webdriver.common.by import By
  10. from selenium.webdriver.support import expected_conditions as EC
  11. import codecs
  12. import random
  13. from bs4 import BeautifulSoup
  14. import requests
  15. import time
  16. # import rpyc
  17. import sys
  18. import docker
  19. # import googlesearch
  20. import codecs
  21. import sys
  22. import time
  23. import dataset
  24. import os
  25. import pymysql
  26. pymysql.install_as_MySQLdb()
  27. def process_one(driver):
  28. lst=[]
  29. elmts=driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
  30. for elmt in elmts:
  31. try:
  32. href=elmt.get_attribute('href')
  33. # print(href)
  34. txt=elmt.text.split('\n')
  35. print(txt[0])
  36. lst.append({'title':txt[0],'url':href})
  37. except:
  38. print('href2 exception')
  39. traceback.print_exc()
  40. return lst
  41. def process_query(driver,url):
  42. try:
  43. driver.get(url)
  44. time.sleep(4)
  45. elmt=driver.find_element_by_xpath("//a[contains(@href,'mailto')]")
  46. print(elmt.text)
  47. print(elmt.get_attribute('href'))
  48. txt=elmt.get_attribute('href')
  49. txt=txt.replace('mailto:','')
  50. if 'mailto:?subject=' in txt:
  51. return None
  52. return txt
  53. except:
  54. print('not found')
  55. return None
  56. # time.sleep(9999)
  57. # try:
  58. # elmt=driver.find_element_by_xpath("//a[@id='pnnext']")
  59. # except:
  60. # traceback.print_exc()
  61. # print('pnnext exception')
  62. # break
  63. # time.sleep(1.5)
  64. # return totallst
  65. result=[]
  66. driver=None
  67. path = '/Users/zooeytsai/Downloads/chromedriver'
  68. def restart_browser():
  69. # os.system('docker container restart p4444')
  70. # time.sleep(10)
  71. options = webdriver.ChromeOptions()
  72. options.add_argument("--headless")
  73. options.add_argument("start-maximized")
  74. options.add_argument('user-data-dir=/Users/zooeytsai/Library/Application Support/Google/Chrome/Default')
  75. # options.add_argument('--profile-directory=Profile 77')
  76. options.add_argument('--profile-directory=Default')
  77. driver=webdriver.Chrome(options=options,executable_path=path)
  78. #driver = webdriver.Remote(
  79. # command_executor='http://127.0.0.1:4444/wd/hub',
  80. #desired_capabilities=options.to_capabilities())
  81. # desired_capabilities=DesiredCapabilities.CHROME)
  82. driver.set_window_size(1400,1000)
  83. return driver
  84. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  85. # cursor=db.query('select title,url,tag from term_gsearch where url not in (select url from term_progress) and tag like "區塊鏈" order by rand()')
  86. cursor=db.query('select title,url,tag from term_gsearch where tag like "區塊鏈" order by rand()')
  87. lst=[]
  88. for c in cursor:
  89. lst.append(c)
  90. table=db['term_progress']
  91. driver=restart_browser()
  92. for c in lst:
  93. email=process_query(driver,c['url'])
  94. print(email)
  95. c['title']=c['title'].replace('聯絡我們 - ','')
  96. c['title']=c['title'].replace('聯絡我們-','')
  97. c['title']=c['title'].replace('聯絡我們|','')
  98. c['title']=c['title'].replace('聯絡我們 |','')
  99. c['title']=c['title'].replace('聯絡我們:','')
  100. c['title']=c['title'].replace('股份有限公司','')
  101. c['title']=c['title'].replace('有限公司','')
  102. c['title']=c['title'].replace('聯絡我們','')
  103. table.insert({'title':c['title'],'url':c['url'],'email':email,'tag':c['tag']})
  104. time.sleep(2)