Bläddra i källkod

Merge branch 'master' of http://git.choozmo.com:3000/choozmo/kw_tools

ming 3 år sedan
förälder
incheckning
5c6ce94bde
1 ändrad fil med 51 tillägg och 48 borttagningar
  1. 51 48
      ig_tags/ig_selenium.py

+ 51 - 48
ig_tags/ig_selenium.py

@@ -12,7 +12,9 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions as ec
 # import dataset
+import ast
 import pymysql
+
 pymysql.install_as_MySQLdb()
 
 import pd
@@ -21,98 +23,99 @@ import setting
 
 account = ['chenlinrain','enjoylisteningswift','novak_goodenough']
 
-password = ['weareyoung12']
+account = pd.account
+password = pd.password
 
-path = "C:\\Users\\user\\Downloads\\chromedriver_win32"
-sbaccount = account[1]
+path = '/Users/zooeytsai/Downloads/chromedriver'
+sbaccount = account[2]
 sbpd = password[1]
 options = webdriver.ChromeOptions()
 # options.add_argument("--headless") # 視窗背後執行
-options.add_argument("user-agent=%s" % setting.rua())
+options.add_argument("user-agent=%s" % rua())
 
 
 def download_post():
-    hashtag = '寵物鮮食'
+    hashtag = '寵物蛋糕'
     url = f"https://www.instagram.com/explore/tags/{hashtag}/"
     s = Service(path)
     driver = webdriver.Chrome(service=s, options=options)
     driver.implicitly_wait(3)
     driver.get('https://www.instagram.com/')
     time.sleep(5)
-    driver.find_element(By.NAME,'username').send_keys(sbaccount)
-    driver.find_element(By.NAME,'password').send_keys(sbpd)
+    driver.find_element(By.NAME, 'username').send_keys(sbaccount)
+    driver.find_element(By.NAME, 'password').send_keys(sbpd)
     time.sleep(3)
-    driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
-    driver.implicitly_wait(3)
-    driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
+    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
+    time.sleep(3)
+    driver.find_element(By.XPATH, '//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
     time.sleep(3)
     driver.get(url)
     time.sleep(3)
-    limit_of_posts = 1
-    limit_of_scroll = 3
+    limit_of_posts = 10
+    limit_of_scroll = 100
     c = 0
-    c_sroll=0
-    wait = WebDriverWait(driver, 10)
-    last_height = driver.execute_script(
-        "window.scrollTo(0, document.body.scrollHeight);var scrolldown=document.body.scrollHeight;return scrolldown;")
+    c_sroll = 0
     link = []
+    
     while True:
         print(c, c_sroll)
-
-        # href_element = wait.until(ec.visibility_of_all_elements_located((By.XPATH, "//a[@href]")))
-        # for i in href_element:
-        #     link.append(i.get_attribute('href'))
-        # print(link)
         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
         time.sleep(3)
-        # new_height = driver.execute_script("return document.body.scrollHeight")
-        # driver.implicitly_wait(3)
-        c+=1
-        c_sroll+=1
+        html = driver.page_source
+        soup = BeautifulSoup(html, 'lxml')
+        for elem in soup.select('article div div div div a'):
+            print(elem['href'])
+            if elem['href'] not in link:
+                link.append(elem['href'])
+        c += 1
+        c_sroll += 1
         if c > limit_of_posts:
             print('中場休息')
-            time.sleep(randint(15,30))
-            c=0
-        if c_sroll>limit_of_scroll:
+            time.sleep(randint(15, 30))
+            c = 0
+        if c_sroll > limit_of_scroll:
             break
-    html = driver.page_source
-    soup = BeautifulSoup(html, 'lxml')
-    for elem in soup.select('article div div div div a'):
-        if elem['href'] not in link:
-            link.append(elem['href'])
+        print(len(link))
     driver.close()
     print(len(link))
+    with open('link.txt', 'w') as f:
+        f.write(str(link))
     return link, hashtag
 
 
 def hashtag():
     input_hashtag = 'pet_cake'
     collect_hashtag = []
-    c=0
-    result = download_post()[0]
+    c = 0
+    f = open('link.txt', 'r')
+    result = f.read()
+    link = ast.literal_eval(result)
     s = Service(path)
-    driver = webdriver.Chrome(service=s,options=options)
+    driver = webdriver.Chrome(service=s, options=options)
     driver.implicitly_wait(3)
     driver.get('https://www.instagram.com/')
     time.sleep(5)
-    driver.find_element(By.NAME,'username').send_keys(sbaccount)
-    driver.find_element(By.NAME,'password').send_keys(sbpd)
+    driver.find_element(By.NAME, 'username').send_keys(sbaccount)
+    driver.find_element(By.NAME, 'password').send_keys(sbpd)
     time.sleep(3)
-    driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
+    
+    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
     driver.implicitly_wait(3)
-    driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
+    
+    driver.find_element(By.XPATH, '//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
     time.sleep(3)
-    for url in result:
+    
+    for url in link:
         driver.get(f"https://www.instagram.com/{url}")
         soup = BeautifulSoup(driver.page_source, 'html.parser')
         hashtags = soup.find_all('a', class_='xil3i')
         for tag in hashtags:
             print(tag.text)
             collect_hashtag.append(tag.text)
-        c+=1
-        if c>10:
-            time.sleep(randint(5,10))
-            c=0
+        c += 1
+        if c > 10:
+            time.sleep(randint(5, 10))
+            c = 0
     driver.close()
     print(collect_hashtag)
     db_company = pymysql.connect(host='db.ptt.cx',
@@ -125,7 +128,7 @@ def hashtag():
     id_number = 0
     cur.execute("select * from seo.ig_pet order by `index` desc limit 1")
     last_id = cur.fetchall()[0][0]
-    insert_row = len(collect_hashtag)-last_id
+    insert_row = len(collect_hashtag) - last_id
     for i in range(0, insert_row):
         query_insert = f"INSERT INTO seo.ig_pet ({input_hashtag}) VALUES ('')"
         cur.execute(query_insert)
@@ -134,11 +137,11 @@ def hashtag():
         query_update = f"UPDATE seo.ig_pet SET {input_hashtag}='{i}' where `index`='{id_number}'"
         cur.execute(query_update)
         id_number += 1
-
+    
     db_company.commit()
     db_company.close()
     return hashtag
 
 
-
+download_post()
 hashtag()