Your Name 3 anos atrás
pai
commit
299432396c

+ 9 - 0
similar_server/clienttest.py

@@ -0,0 +1,9 @@
+import rpyc
+import time
+conn = rpyc.connect("localhost",12345)
+conn.root.get_url('https://yahoo.com')
+time.sleep(10)
+conn.root.get_url('https://google.com')
+
+
+#https://data.similarweb.com/api/v1/data?domain=cnn.com

+ 67 - 0
similar_server/sel_server.py

@@ -0,0 +1,67 @@
+from datetime import datetime
+import os
+import sys
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait, Select
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.remote.webdriver import WebDriver
+import time
+import json
+import rpyc
+from rpyc.utils.server import ThreadedServer # or ForkingServer
+
+
+
+def send(driver, cmd, params={}):
+    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
+    url = driver.command_executor._url + resource
+    body = json.dumps({'cmd': cmd, 'params': params})
+    response = driver.command_executor._request('POST', url, body)
+#    if response['status']:
+#        raise Exception(response.get('value'))
+    return response.get('value')
+
+def add_script(driver, script):
+    send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})
+
+def init_webdriver():
+    WebDriver.add_script = add_script
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+    options = webdriver.ChromeOptions()
+    driver = webdriver.Chrome(  chrome_options=options)
+    return driver
+
+#global driver
+class MyService(rpyc.Service):
+    def process(self,url):
+        self.driver.add_script('const setProperty = () => {     Object.defineProperty(navigator, "webdriver", {       get: () => false,     }); }; setProperty();')
+        print('add url.............')
+        print(self.driver)
+
+    def __init__(self):
+        self.driver = None
+        try:
+            self.driver = init_webdriver()
+            print(self.driver )
+        except Exception as e:
+            raise e
+        finally:
+            True
+        pass
+    
+    def exposed_get_url(self,url):
+        self.driver.add_script('const setProperty = () => {     Object.defineProperty(navigator, "webdriver", {       get: () => false,     }); }; setProperty();')
+        print('add url.............')
+        print(self.driver)
+        self.driver.get(url)
+
+
+
+
+if __name__ == "__main__":
+    server = ThreadedServer(MyService(), port = 12345,protocol_config={
+    'allow_public_attrs': True,
+})
+    server.start()

+ 73 - 0
tests/bypass.py

@@ -0,0 +1,73 @@
+"""
+Example of bypassing Distil Networks bot detection (https://www.distilnetworks.com/) with Selenium.
+Distil uses the JavaScript property navigator.webdriver to detect and ban Selenium.
+The solution is to inject JavaScript code before the webpage loads, so that navigator.webdriver returns false.
+Works only with the Chromium driver.
+"""
+
+from datetime import datetime
+import os
+import sys
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait, Select
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.remote.webdriver import WebDriver
+import time
+import json
+
+
+def send(driver, cmd, params={}):
+    """
+    Send command to chromium driver
+    """
+    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
+    url = driver.command_executor._url + resource
+    body = json.dumps({'cmd': cmd, 'params': params})
+    response = driver.command_executor._request('POST', url, body)
+#    if response['status']:
+#        raise Exception(response.get('value'))
+    return response.get('value')
+
+def add_script(driver, script):
+    """
+    Inject script before loading page
+    Cf: https://stackoverflow.com/a/47298910
+    """
+    send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})
+
+
+def process(driver):
+    driver.add_script('const setProperty = () => {     Object.defineProperty(navigator, "webdriver", {       get: () => false,     }); }; setProperty();')
+    # load a page
+#    driver.get('https://www.similarweb.com')
+    driver.get('about:blank')
+    time.sleep(20)
+
+
+
+def init_webdriver():
+    """
+    Initialize the Selenium web driver used for scraping websites.
+    """
+    WebDriver.add_script = add_script
+    dir_path = os.path.dirname(os.path.realpath(__file__))
+#    driver_path = r'%s/lib/chromedriver' % dir_path
+    options = webdriver.ChromeOptions()
+    driver = webdriver.Chrome(  chrome_options=options)
+    return driver
+
+
+if __name__ == '__main__':
+
+    driver = None
+    try:
+        driver = init_webdriver()
+        process(driver)
+    except Exception as e:
+#        logger.error('Error during process %s' % e)
+        raise e
+    finally:
+        if driver is not None:
+            driver.close()

+ 69 - 0
tests/sel_local.py

@@ -0,0 +1,69 @@
+import sys
+import time
+sys.path.insert(0,'./kw_tools/web')
+import browser_common
+from selenium import webdriver
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
+
+jb=browser_common.JBrowser()
+#jb.set_profile_path('Profile 7')
+jb.set_profile_path(None)
+
+#jb.get('https://www.similarweb.com/website/yahoo.com/')
+#jb.get('https://www.similarweb.com/website/ptt.cc/')
+#jb.get('https://www.similarweb.com/website/591.com.tw/')
+#jb.get('https://data.similarweb.com/api/v1/data?domain=591.com.tw')
+
+driver=jb.get_driver()
+
+
+jb.get('https://google.com')
+#driver.add_script('const setProperty = () => {     Object.defineProperty(navigator, "webdriver", {       get: () => false,     }); }; setProperty();')
+
+#jb.get('https://www.similarweb.com/')
+
+#jb.get('https://data.similarweb.com/api/v1/data?domain=cnn.com')
+
+
+#jb.get('https://www.similarweb.com/')
+time.sleep(2)
+
+driver.execute_script("window.location.href = 'https://www.similarweb.com';")
+
+#jb.get('https://www.similarweb.com/')
+
+
+elmt=driver.find_element_by_xpath("//input[@class='app-search__input']")
+webdriver.ActionChains(driver).move_to_element(elmt).perform()
+webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+
+time.sleep(5)
+elmt.send_keys('similarweb.com')
+
+elmt=driver.find_element_by_xpath("//button[@class='swui-button swui-button--solid swui-button--primary swui-button--brand swui-button--sm']")
+webdriver.ActionChains(driver).move_to_element(elmt).perform()
+webdriver.ActionChains(driver).move_to_element(elmt).click().perform()
+time.sleep(10)
+
+#src=driver.page_source
+#print(src)
+
+#time.sleep(9999)
+
+
+
+
+
+#class="websiteRanks-valueContainer js-websiteRanksValue"
+
+elmt=driver.find_element_by_xpath("//a[@data-analytics-label='Country Rank/158']")
+print(elmt.text)
+elmt=driver.find_element_by_xpath("//a[@data-analytics-label='Category Rank/computers-electronics-and-technology/social-networks-and-online-communities']")
+print(elmt.text)
+
+elmts=driver.find_elements_by_xpath("//div[@class='websiteRanks-valueContainer js-websiteRanksValue']")
+for elmt in elmts:
+    print(elmt.text)
+

+ 2 - 0
tests/seltest.py

@@ -13,5 +13,7 @@ jb.get('https://www.similarweb.com/website/google.com/')
 
 driver=jb.get_driver()
 
+#class="websiteRanks-valueContainer js-websiteRanksValue"
+
 src=driver.page_source
 print(src)

+ 55 - 5
web/browser_common.py

@@ -1,15 +1,44 @@
 from selenium import webdriver
+#from seleniumwire import webdriver
 import time
 #import networkx as nx
 #import dataset
 #import pickle
 #import codecs
 from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait, Select
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.remote.webdriver import WebDriver
 import sys
 import os
 import time
 import re
 
+def interceptor(request):
+    del request.headers['Referer']  # Remember to delete the header first
+    request.headers['Referer'] = 'https://www.google.com/'  # Spoof the referer
+
+def send(driver, cmd, params={}):
+    """
+    Send command to chromium driver
+    """
+    resource = "/session/%s/chromium/send_command_and_get_result" % driver.session_id
+    url = driver.command_executor._url + resource
+    body = json.dumps({'cmd': cmd, 'params': params})
+    response = driver.command_executor._request('POST', url, body)
+    if response['status']:
+        raise Exception(response.get('value'))
+    return response.get('value')
+
+def add_script(driver, script):
+    """
+    Inject script before loading page
+    Cf: https://stackoverflow.com/a/47298910
+    """
+    send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})
+
 class JBrowser:
     def __init__(self):
         if os.name=='nt':
@@ -23,15 +52,36 @@ class JBrowser:
         option.add_argument('--allow-running-insecure-content')
 #        option.add_argument('--headless')
 
-        if os.name=='nt':
-            option.add_argument("--user-data-dir='C:\\Users\\jared\\AppData\\Local\\Google\\Chrome\\User Data\\"+self.profilepath+"\\'")
-        else:
-            option.add_argument("--user-data-dir="+self.profilepath)
-            option.add_argument('--profile-directory="Profile 1"')
+#        option.add_argument("window-size=1280,800")
+#        option.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
+        option.add_argument('--disable-blink-features=AutomationControlled')
+        if profilepath is not None:
+            if os.name=='nt' :
+                option.add_argument("--user-data-dir=C:\\Users\\jared\\AppData\\Local\\Google\\Chrome\\User Data\\"+self.profilepath+"\\")
+
+#                option.add_argument("--user-data-dir='C:\\Users\\jared\\AppData\\Local\\Google\\Chrome\\User Data\\"+self.profilepath+"\\'")
+#                option.add_argument("--user-data-dir=C:\\Users\\jared\\AppData\\Local\\Google\\Chrome\\User Data\\")
 
+#                option.add_argument("--profile-directory='"+self.profilepath+"'")
+
+            else:
+                option.add_argument("--user-data-dir="+self.profilepath)
+                option.add_argument('--profile-directory="Profile 1"')
+
+        option.add_experimental_option("excludeSwitches", ["enable-automation"])
+        option.add_experimental_option('useAutomationExtension', False)
         self.option=option
+
+        WebDriver.add_script = add_script
+
         driver = webdriver.Chrome(options=option)
+        driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
+        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
+        driver.request_interceptor = interceptor
+
         self.driver=driver
+        driver.delete_all_cookies()
+ 
 #        executor_url = driver.command_executor._url
 #        session_id = driver.session_id