Browse Source

ig hashtags selenium

zooeytsai 3 years ago
parent
commit
53237d5811
1 changed files with 96 additions and 0 deletions
  1. 96 0
      ig_tags/ig_selenium.py

+ 96 - 0
ig_tags/ig_selenium.py

@@ -0,0 +1,96 @@
+from selenium import webdriver
+import random
+import time
+from requests.cookies import RequestsCookieJar
+import requests
+from requests.adapters import HTTPAdapter
+from bs4 import  BeautifulSoup
+import json
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as ec
+import dataset
+import pymysql
+
+
+account = ['enjoylisteningswift']
+pd = ['weareyoung12']
+
+path = '/Users/zooeytsai/Downloads/chromedriver'
+sbaccount = account[2]
+sbpd = pd[1]
+options = Options()
+options.add_argument("--headless")
+
+def download_post():
+    hashtag = '台菜餐廳'
+    url = f"https://www.instagram.com/explore/tags/{hashtag}/"
+    s = Service(path)
+    driver = webdriver.Chrome(service=s)
+    driver.implicitly_wait(3)
+    driver.get('https://www.instagram.com/')
+    time.sleep(5)
+    driver.find_element(By.NAME,'username').send_keys(sbaccount)
+    driver.find_element(By.NAME,'password').send_keys(sbpd)
+    time.sleep(3)
+    driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
+    driver.implicitly_wait(3)
+    driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
+    time.sleep(3)
+    cookie = driver.get_cookies()
+    jar = RequestsCookieJar()
+    for i in cookie:
+        jar.set(i['name'], i['value'])
+    driver.get(url)
+    time.sleep(3)
+    number_of_posts = 100
+    wait = WebDriverWait(driver, 10)
+    last_height = driver.execute_script(
+        "window.scrollTo(0, document.body.scrollHeight);var scrolldown=document.body.scrollHeight;return scrolldown;")
+    link = []
+    while True:
+        href_element = wait.until(ec.visibility_of_all_elements_located((By.XPATH, "//a[@href]")))
+        for i in href_element:
+            link.append(i.get_attribute('href'))
+        print(link)
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        time.sleep(3)
+        new_height = driver.execute_script("return document.body.scrollHeight")
+        driver.implicitly_wait(3)
+        number_of_posts+=1
+        if len(link)>number_of_posts:
+            time.sleep(random(180,300))
+            number_of_posts == 0
+        if new_height == last_height:
+            break
+
+    driver.close()
+    print(len(link))
+    return link
+
+db = dataset.connect('mysql://root:jondae350@localhost:3306/ig_tags?charset=utf8mb4')
+table = db['ig_tags']
+def hashtag():
+    hashtag = []
+    for url in download_post():
+        s = Service(path)
+        driver = webdriver.Chrome(service=s)
+        driver.implicitly_wait(3)
+        driver.get(url)
+        soup = BeautifulSoup(driver.page_source, 'html.parser')
+        hashtags = soup.find_all('a', class_='xil3i')
+        for tag in hashtags:
+            print(tag.text)
+            hashtag.append(tag)
+    driver.close()
+    cur = db.cursor()
+    table.create_column('taiwan_food',db.types.text)
+    values = [list([item]) for item in hashtag]
+    cur.executemany('INSERT INTO ig_tags (taiwan_food) VALUES (%s)',values)
+    return hashtag
+
+
+download_post()
+hashtag()