3 years ago · 53237d5811
--- a/ig_tags/ig_selenium.py
+++ b/ig_tags/ig_selenium.py
@@ -0,0 +1,96 @@
 
				+from selenium import webdriver
			
 
				+import random
			
 
				+import time
			
 
				+from requests.cookies import RequestsCookieJar
			
 
				+import requests
			
 
				+from requests.adapters import HTTPAdapter
			
 
				+from bs4 import  BeautifulSoup
			
 
				+import json
			
 
				+from selenium.webdriver.chrome.options import Options
			
 
				+from selenium.webdriver.chrome.service import Service
			
 
				+from selenium.webdriver.common.by import By
			
 
				+from selenium.webdriver.support.wait import WebDriverWait
			
 
				+from selenium.webdriver.support import expected_conditions as ec
			
 
				+import dataset
			
 
				+import pymysql
			
 
				+
			
 
				+
			
 
				+account = ['enjoylisteningswift']
			
 
				+pd = ['weareyoung12']
			
 
				+
			
 
				+path = '/Users/zooeytsai/Downloads/chromedriver'
			
 
				+sbaccount = account[2]
			
 
				+sbpd = pd[1]
			
 
				+options = Options()
			
 
				+options.add_argument("--headless")
			
 
				+
			
 
				+def download_post():
			
 
				+    hashtag = '台菜餐廳'
			
 
				+    url = f"https://www.instagram.com/explore/tags/{hashtag}/"
			
 
				+    s = Service(path)
			
 
				+    driver = webdriver.Chrome(service=s)
			
 
				+    driver.implicitly_wait(3)
			
 
				+    driver.get('https://www.instagram.com/')
			
 
				+    time.sleep(5)
			
 
				+    driver.find_element(By.NAME,'username').send_keys(sbaccount)
			
 
				+    driver.find_element(By.NAME,'password').send_keys(sbpd)
			
 
				+    time.sleep(3)
			
 
				+    driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
			
 
				+    driver.implicitly_wait(3)
			
 
				+    driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
			
 
				+    time.sleep(3)
			
 
				+    cookie = driver.get_cookies()
			
 
				+    jar = RequestsCookieJar()
			
 
				+    for i in cookie:
			
 
				+        jar.set(i['name'], i['value'])
			
 
				+    driver.get(url)
			
 
				+    time.sleep(3)
			
 
				+    number_of_posts = 100
			
 
				+    wait = WebDriverWait(driver, 10)
			
 
				+    last_height = driver.execute_script(
			
 
				+        "window.scrollTo(0, document.body.scrollHeight);var scrolldown=document.body.scrollHeight;return scrolldown;")
			
 
				+    link = []
			
 
				+    while True:
			
 
				+        href_element = wait.until(ec.visibility_of_all_elements_located((By.XPATH, "//a[@href]")))
			
 
				+        for i in href_element:
			
 
				+            link.append(i.get_attribute('href'))
			
 
				+        print(link)
			
 
				+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
			
 
				+        time.sleep(3)
			
 
				+        new_height = driver.execute_script("return document.body.scrollHeight")
			
 
				+        driver.implicitly_wait(3)
			
 
				+        number_of_posts+=1
			
 
				+        if len(link)>number_of_posts:
			
 
				+            time.sleep(random(180,300))
			
 
				+            number_of_posts == 0
			
 
				+        if new_height == last_height:
			
 
				+            break
			
 
				+
			
 
				+    driver.close()
			
 
				+    print(len(link))
			
 
				+    return link
			
 
				+
			
 
				+db = dataset.connect('mysql://root:jondae350@localhost:3306/ig_tags?charset=utf8mb4')
			
 
				+table = db['ig_tags']
			
 
				+def hashtag():
			
 
				+    hashtag = []
			
 
				+    for url in download_post():
			
 
				+        s = Service(path)
			
 
				+        driver = webdriver.Chrome(service=s)
			
 
				+        driver.implicitly_wait(3)
			
 
				+        driver.get(url)
			
 
				+        soup = BeautifulSoup(driver.page_source, 'html.parser')
			
 
				+        hashtags = soup.find_all('a', class_='xil3i')
			
 
				+        for tag in hashtags:
			
 
				+            print(tag.text)
			
 
				+            hashtag.append(tag)
			
 
				+    driver.close()
			
 
				+    cur = db.cursor()
			
 
				+    table.create_column('taiwan_food',db.types.text)
			
 
				+    values = [list([item]) for item in hashtag]
			
 
				+    cur.executemany('INSERT INTO ig_tags (taiwan_food) VALUES (%s)',values)
			
 
				+    return hashtag
			
 
				+
			
 
				+
			
 
				+download_post()
			
 
				+hashtag()