zooeytsai 3 rokov pred
rodič
commit
3e24d43eeb
1 zmenený súbor, 39 pridaní a 17 odobraní
  1. 39 17
      ig_tags/ig_selenium.py

+ 39 - 17
ig_tags/ig_selenium.py

@@ -25,6 +25,7 @@ sbpd = pd[1]
 options = Options()
 options.add_argument("--headless")
 
+
 def download_post():
     hashtag = '寵物零食'
     url = f"https://www.instagram.com/explore/tags/{hashtag}/"
@@ -47,29 +48,39 @@ def download_post():
     driver.get(url)
     time.sleep(3)
     limit_of_posts = 10
+    limit_of_scroll = 250
     c = 0
+    c_sroll=0
     wait = WebDriverWait(driver, 10)
     last_height = driver.execute_script(
         "window.scrollTo(0, document.body.scrollHeight);var scrolldown=document.body.scrollHeight;return scrolldown;")
     link = []
     while True:
-        print(c)
-        href_element = wait.until(ec.visibility_of_all_elements_located((By.XPATH, "//a[@href]")))
-        for i in href_element:
-            link.append(i.get_attribute('href'))
-        print(link)
+        print(c, c_sroll)
+
+        # href_element = wait.until(ec.visibility_of_all_elements_located((By.XPATH, "//a[@href]")))
+        # for i in href_element:
+        #     link.append(i.get_attribute('href'))
+        # print(link)
         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
         time.sleep(3)
-        new_height = driver.execute_script("return document.body.scrollHeight")
-        driver.implicitly_wait(3)
+        # new_height = driver.execute_script("return document.body.scrollHeight")
+        # driver.implicitly_wait(3)
         c+=1
+        c_sroll+=1
         if c > limit_of_posts:
             print('中場休息')
-            time.sleep(randint(180,300))
+            time.sleep(randint(30,60))
             c=0
-        if len(link)>5000:
+        # if len(link)>500:
+        #     break
+        if c_sroll>limit_of_scroll:
             break
-
+    html = driver.page_source
+    soup = BeautifulSoup(html, 'lxml')
+    for elem in soup.select('article div div div div a'):
+        if elem['href'] not in link:
+            link.append(elem['href'])
     driver.close()
     print(len(link))
     return link
@@ -80,16 +91,25 @@ def hashtag():
     hashtag = []
     c=0
     result = download_post()
+    s = Service(path)
+    driver = webdriver.Chrome(service=s)
+    driver.implicitly_wait(3)
+    driver.get('https://www.instagram.com/')
+    time.sleep(5)
+    driver.find_element(By.NAME,'username').send_keys(sbaccount)
+    driver.find_element(By.NAME,'password').send_keys(sbpd)
+    time.sleep(3)
+    driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
+    driver.implicitly_wait(3)
+    driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
+    time.sleep(3)
     for url in result:
-        s = Service(path)
-        driver = webdriver.Chrome(service=s)
-        driver.implicitly_wait(3)
-        driver.get(url)
+        driver.get(f"https://www.instagram.com/{url}")
         soup = BeautifulSoup(driver.page_source, 'html.parser')
         hashtags = soup.find_all('a', class_='xil3i')
         for tag in hashtags:
             print(tag.text)
-            hashtag.append(tag)
+            hashtag.append(tag.text)
         c+=1
         if c>10:
             time.sleep(3)
@@ -101,9 +121,11 @@ def hashtag():
                          password='jondae350',
                          database='ig_tags')
     cur = db.cursor()
-    query = "ALTER TABLE ig_tags ADD taiwan_food TEXT(100)"
+    query = "ALTER TABLE ig_tags ADD health_product TEXT(100)"
+    id_number = 1
     for i in hashtag:
-        cur.execute('INSERT INTO ig_tags (taiwan_food) VALUES (%s)',i)
+        cur.execute(f'UPDATE ig_tags set snack={i} where id={id_number}')
+        id_number+=1
     db.commit()
     db.close()
     return hashtag