# term_get_email.py
# Visits each stored search-result URL and records the first mailto address found on the page.

import codecs
import datetime
import os
import random
import sys
import time
import traceback
import urllib.parse

import dataset
import docker
import pymysql
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Disabled optional imports (both sides of the former merge conflict agreed on this).
# import googlesearch
# import rpyc

# Make PyMySQL importable under the MySQLdb name, so libraries that import
# MySQLdb (e.g. for a "mysql://" connection URL) use PyMySQL instead.
pymysql.install_as_MySQLdb()
  31. def process_one(driver):
  32. lst=[]
  33. elmts=driver.find_elements_by_xpath("//div[@class='yuRUbf']/a")
  34. for elmt in elmts:
  35. try:
  36. href=elmt.get_attribute('href')
  37. # print(href)
  38. txt=elmt.text.split('\n')
  39. print(txt[0])
  40. lst.append({'title':txt[0],'url':href})
  41. except:
  42. print('href2 exception')
  43. traceback.print_exc()
  44. return lst
  45. def process_query(driver,url):
  46. try:
  47. driver.get(url)
  48. time.sleep(4)
  49. elmt=driver.find_element_by_xpath("//a[contains(@href,'mailto')]")
  50. print(elmt.text)
  51. print(elmt.get_attribute('href'))
  52. txt=elmt.get_attribute('href')
  53. txt=txt.replace('mailto:','')
  54. if 'mailto:?subject=' in txt:
  55. return None
  56. return txt
  57. except:
  58. print('not found')
  59. return None
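# NOTE(review): disabled leftover code, kept for reference only. It looks like
# next-page handling; it relies on an enclosing loop and a `totallst` variable
# that do not appear in the visible part of this file.
# time.sleep(9999)
# try:
#     elmt=driver.find_element_by_xpath("//a[@id='pnnext']")
# except:
#     traceback.print_exc()
#     print('pnnext exception')
#     break
# time.sleep(1.5)
# return totallst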
  69. result=[]
  70. driver=None
  71. path = '/Users/zooeytsai/Downloads/chromedriver'
  72. def restart_browser():
  73. # os.system('docker container restart p4444')
  74. # time.sleep(10)
  75. options = webdriver.ChromeOptions()
  76. options.add_argument("--headless")
  77. options.add_argument("start-maximized")
  78. <<<<<<< HEAD
  79. options.add_argument('user-data-dir=C:\\Users\\jared\\AppData\\Local\\Google\\Chrome\\User Data')
  80. # options.add_argument('--profile-directory=Profile 77')
  81. # options.add_argument('--profile-directory=Default')
  82. driver=webdriver.Chrome(options=options,executable_path='C:\\Users\\user\\Downloads\\chromedriver_99\\chromedriver')
  83. =======
  84. options.add_argument('user-data-dir=/Users/zooeytsai/Library/Application Support/Google/Chrome/Default')
  85. # options.add_argument('--profile-directory=Profile 77')
  86. options.add_argument('--profile-directory=Default')
  87. driver=webdriver.Chrome(options=options,executable_path=path)
  88. >>>>>>> 604b4f0737fe7f055de28e84a0a4697a346701cf
  89. #driver = webdriver.Remote(
  90. # command_executor='http://127.0.0.1:4444/wd/hub',
  91. #desired_capabilities=options.to_capabilities())
  92. # desired_capabilities=DesiredCapabilities.CHROME)
  93. driver.set_window_size(1400,1000)
  94. return driver
  95. db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
  96. # cursor=db.query('select title,url,tag from term_gsearch where url not in (select url from term_progress) and tag like "區塊鏈" order by rand()')
  97. cursor=db.query('select title,url,tag from term_gsearch where tag like "區塊鏈" order by rand()')
  98. lst=[]
  99. for c in cursor:
  100. lst.append(c)
  101. table=db['term_progress']
  102. driver=restart_browser()
  103. for c in lst:
  104. email=process_query(driver,c['url'])
  105. print(email)
  106. c['title']=c['title'].replace('聯絡我們 - ','')
  107. c['title']=c['title'].replace('聯絡我們-','')
  108. c['title']=c['title'].replace('聯絡我們|','')
  109. c['title']=c['title'].replace('聯絡我們 |','')
  110. c['title']=c['title'].replace('聯絡我們:','')
  111. c['title']=c['title'].replace('股份有限公司','')
  112. c['title']=c['title'].replace('有限公司','')
  113. c['title']=c['title'].replace('聯絡我們','')
  114. table.insert({'title':c['title'],'url':c['url'],'email':email,'tag':c['tag']})
  115. time.sleep(2)