Browse Source

Merge branch 'master' of http://git.choozmo.com:3000/choozmo/kw_tools

Jared 3 years ago
parent
commit
ae2ddc0142
1 changed files with 109 additions and 0 deletions
  1. 109 0
      ig_tags/ig_selenium.py

+ 109 - 0
ig_tags/ig_selenium.py

@@ -0,0 +1,109 @@
+from selenium import webdriver
+from random import randint
+import time
+from requests.cookies import RequestsCookieJar
+import requests
+from requests.adapters import HTTPAdapter
+from bs4 import BeautifulSoup
+import json
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as ec
+import dataset
+import pymysql
+pymysql.install_as_MySQLdb()
+
+
+account = ['chenlinrain']
+pd = ['weareyoung12']
+
+path = '/Users/zooeytsai/Downloads/chromedriver'
+sbaccount = account[2]
+sbpd = pd[1]
+options = Options()
+options.add_argument("--headless")
+
+def download_post():
+    hashtag = '寵物'
+    url = f"https://www.instagram.com/explore/tags/{hashtag}/"
+    s = Service(path)
+    driver = webdriver.Chrome(service=s)
+    driver.implicitly_wait(3)
+    driver.get('https://www.instagram.com/')
+    time.sleep(5)
+    driver.find_element(By.NAME,'username').send_keys(sbaccount)
+    driver.find_element(By.NAME,'password').send_keys(sbpd)
+    time.sleep(3)
+    driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
+    driver.implicitly_wait(3)
+    driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
+    time.sleep(3)
+    cookie = driver.get_cookies()
+    jar = RequestsCookieJar()
+    for i in cookie:
+        jar.set(i['name'], i['value'])
+    driver.get(url)
+    time.sleep(3)
+    number_of_posts = 100
+    wait = WebDriverWait(driver, 10)
+    last_height = driver.execute_script(
+        "window.scrollTo(0, document.body.scrollHeight);var scrolldown=document.body.scrollHeight;return scrolldown;")
+    link = []
+    while True:
+        href_element = wait.until(ec.visibility_of_all_elements_located((By.XPATH, "//a[@href]")))
+        for i in href_element:
+            link.append(i.get_attribute('href'))
+        print(link)
+        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        time.sleep(3)
+        new_height = driver.execute_script("return document.body.scrollHeight")
+        driver.implicitly_wait(3)
+        number_of_posts+=1
+        if len(link)>number_of_posts:
+            time.sleep(randint(180,300))
+            number_of_posts == 0
+        if len(link)>10000:
+            break
+
+    driver.close()
+    print(len(link))
+    return link
+
+
+
+def hashtag():
+    hashtag = []
+    c=0
+    for url in download_post():
+        s = Service(path)
+        driver = webdriver.Chrome(service=s)
+        driver.implicitly_wait(3)
+        driver.get(url)
+        soup = BeautifulSoup(driver.page_source, 'html.parser')
+        hashtags = soup.find_all('a', class_='xil3i')
+        for tag in hashtags:
+            print(tag.text)
+            hashtag.append(tag)
+        c+=1
+        if c>10:
+            time.sleep(3)
+            c=0
+    driver.close()
+
+    db = pymysql.connect(host='localhost',
+                         user='root',
+                         password='jondae350',
+                         database='ig_tags')
+    cur = db.cursor()
+    query = "ALTER TABLE ig_tags ADD taiwan_food TEXT(100)"
+    for i in hashtag:
+        cur.execute('INSERT INTO ig_tags (taiwan_food) VALUES (%s)',i)
+    db.commit()
+    db.close()
+    return hashtag
+
+
+download_post()
+hashtag()