|
@@ -0,0 +1,109 @@
|
|
|
+from selenium import webdriver
|
|
|
+from random import randint
|
|
|
+import time
|
|
|
+from requests.cookies import RequestsCookieJar
|
|
|
+import requests
|
|
|
+from requests.adapters import HTTPAdapter
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+import json
|
|
|
+from selenium.webdriver.chrome.options import Options
|
|
|
+from selenium.webdriver.chrome.service import Service
|
|
|
+from selenium.webdriver.common.by import By
|
|
|
+from selenium.webdriver.support.wait import WebDriverWait
|
|
|
+from selenium.webdriver.support import expected_conditions as ec
|
|
|
+import dataset
|
|
|
+import pymysql
|
|
|
+pymysql.install_as_MySQLdb()
|
|
|
+
|
|
|
+
|
|
|
# Instagram login credentials.
# SECURITY NOTE(review): these secrets are hard-coded in source control; move
# them to environment variables or a secrets store and rotate them.
account = ['chenlinrain']
pd = ['weareyoung12']

# Filesystem location of the chromedriver binary handed to selenium's Service.
path = '/Users/zooeytsai/Downloads/chromedriver'

# Each list holds exactly one entry, so index 0 is the only valid position;
# indexing past it raises IndexError as soon as the module is loaded.
sbaccount = account[0]
sbpd = pd[0]
|
|
|
# Chrome options requesting headless mode. They only take effect on a browser
# when this object is passed to webdriver.Chrome via its `options=` argument.
options = Options()
options.add_argument("--headless")
|
|
|
+
|
|
|
def download_post(tag='寵物', max_links=10000, pause_every=100):
    """Log in to Instagram and collect link URLs from a hashtag explore page.

    The function opens Chrome, signs in with the module-level ``sbaccount`` /
    ``sbpd`` credentials, opens ``/explore/tags/<tag>/`` and repeatedly
    scrolls to the bottom of the page, recording the ``href`` of every
    visible ``<a>`` element after each scroll.

    Args:
        tag: Hashtag (without ``#``) whose explore page is scraped.
        max_links: Scrolling stops once more than this many distinct links
            have been collected.
        pause_every: After roughly this many newly collected links the
            function sleeps for a random 180-300 seconds before scrolling
            again.

    Returns:
        A list of distinct href strings in first-seen order. The XPath
        matches every anchor on the page, so the list is not restricted to
        post permalinks.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if a login form
            element or the post-login dialog button is not found.
        selenium.common.exceptions.TimeoutException: if no set of visible
            anchors appears within the 10 second explicit wait.
    """
    url = f"https://www.instagram.com/explore/tags/{tag}/"

    # NOTE(review): the module-level headless `options` object is not passed
    # to Chrome here, so a visible browser window is used -- confirm whether
    # headless mode was intended before wiring it in.
    driver = webdriver.Chrome(service=Service(path))
    try:
        # The implicit wait is a session-wide setting; setting it once is enough.
        driver.implicitly_wait(3)

        # --- log in -------------------------------------------------------
        driver.get('https://www.instagram.com/')
        time.sleep(5)
        driver.find_element(By.NAME, 'username').send_keys(sbaccount)
        driver.find_element(By.NAME, 'password').send_keys(sbpd)
        time.sleep(3)
        driver.find_element(
            By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click()  # log in
        # Click the button of the section shown right after logging in.
        driver.find_element(
            By.XPATH,
            '//*[@id="react-root"]/section/main/div/div/div/section/div/button',
        ).click()
        time.sleep(3)

        # --- open the hashtag page and scroll ------------------------------
        driver.get(url)
        time.sleep(3)

        wait = WebDriverWait(driver, 10)
        last_height = driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);"
            "var scrolldown=document.body.scrollHeight;return scrolldown;")

        # A dict is used as an insertion-ordered set: the same anchors are
        # still on the page after every scroll, so plain appending would
        # mostly record duplicates.
        seen = {}
        next_pause_at = pause_every
        stalled_scrolls = 0

        while True:
            anchors = wait.until(
                ec.visibility_of_all_elements_located((By.XPATH, "//a[@href]")))
            for anchor in anchors:
                href = anchor.get_attribute('href')
                if href:
                    seen.setdefault(href, None)
            print(len(seen))  # progress: distinct links collected so far

            if len(seen) > max_links:
                break

            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)
            new_height = driver.execute_script(
                "return document.body.scrollHeight")

            # Stop once the page has not grown for several scrolls in a row;
            # otherwise a feed with fewer than max_links links loops forever.
            if new_height == last_height:
                stalled_scrolls += 1
                if stalled_scrolls >= 3:
                    break
            else:
                stalled_scrolls = 0
                last_height = new_height

            # Long random pause after each batch of new links, then move the
            # threshold forward so the next pause needs another full batch.
            if len(seen) > next_pause_at:
                time.sleep(randint(180, 300))
                next_pause_at = len(seen) + pause_every
    finally:
        # quit() ends the chromedriver session as well as the window, and
        # runs even when a locator or wait fails.
        driver.quit()

    link = list(seen)
    print(len(link))
    return link
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def hashtag(urls=None):
    """Scrape hashtag texts from pages and insert them into MySQL.

    Each URL is opened in Chrome, the rendered page source is parsed with
    BeautifulSoup, and the text of every ``<a class="xil3i">`` element is
    collected. The collected texts are then inserted, one row each, into the
    ``taiwan_food`` column of table ``ig_tags`` in the local ``ig_tags``
    database.

    Args:
        urls: Iterable of page URLs to visit. When ``None`` (the default) the
            URLs are obtained by calling ``download_post()``.

    Returns:
        The list of hashtag text strings, in the order they were found.

    Note:
        The ``taiwan_food`` column is assumed to exist already; this function
        does not alter the table schema.
    """
    if urls is None:
        urls = download_post()

    tag_texts = []

    # One browser session is reused for every page and always shut down,
    # instead of starting a new Chrome process per URL.
    driver = webdriver.Chrome(service=Service(path))
    try:
        driver.implicitly_wait(3)
        for page_count, url in enumerate(urls, start=1):
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for anchor in soup.find_all('a', class_='xil3i'):
                print(anchor.text)
                # Keep the text, not the Tag object: the database driver
                # can only escape plain values.
                tag_texts.append(anchor.text)
            # Short breather after every 10 page loads.
            if page_count % 10 == 0:
                time.sleep(3)
    finally:
        driver.quit()

    # SECURITY NOTE(review): database credentials are hard-coded here; move
    # them to configuration or environment variables.
    db = pymysql.connect(host='localhost',
                         user='root',
                         password='jondae350',
                         database='ig_tags')
    try:
        with db.cursor() as cur:
            # Parameters are passed as 1-tuples so each text is bound as a
            # single value.
            cur.executemany(
                'INSERT INTO ig_tags (taiwan_food) VALUES (%s)',
                [(text,) for text in tag_texts],
            )
        db.commit()
    finally:
        db.close()

    return tag_texts
|
|
|
+
|
|
|
+
|
|
|
# Run the scraper only when this file is executed as a script.
# hashtag() obtains its URLs from download_post() by itself, so calling
# download_post() separately beforehand would repeat the entire
# login-and-scroll session and throw its result away.
if __name__ == "__main__":
    hashtag()
|