Sfoglia il codice sorgente

調整視窗下拉

zooeytsai 3 anni fa
parent
commit
6f1b7a855a
1 file modificato con 60 aggiunte e 50 eliminazioni
  1. 60 50
      ig_tags/ig_selenium.py

+ 60 - 50
ig_tags/ig_selenium.py

@@ -1,3 +1,4 @@
+# coding:utf-8
 from selenium import webdriver
 from random import randint
 import time
@@ -14,49 +15,44 @@ from selenium.webdriver.support import expected_conditions as ec
 # import dataset
 import ast
 import pymysql
-
 pymysql.install_as_MySQLdb()
 
-import pd
-import setting
-
 
-account = ['chenlinrain','enjoylisteningswift','novak_goodenough']
+account = ['chenlinrain','aruikuwasaki','enjoylisteningswift','novak_goodenough']
+password = ['weareyoung12']
 
-account = pd.account
-password = pd.password
+path = 'C:\\Users\\user\\Downloads\\chromedriver_99\\chromedriver'
 
-path = '/Users/zooeytsai/Downloads/chromedriver'
-sbaccount = account[2]
-sbpd = password[1]
 options = webdriver.ChromeOptions()
-# options.add_argument("--headless") # 視窗背後執行
+options.add_argument("--headless") # 視窗背後執行
 options.add_argument("user-agent=%s" % rua())
 
 
 def download_post():
-    hashtag = '寵物蛋糕'
+    hashtag = '上課平台'
     url = f"https://www.instagram.com/explore/tags/{hashtag}/"
     s = Service(path)
     driver = webdriver.Chrome(service=s, options=options)
     driver.implicitly_wait(3)
     driver.get('https://www.instagram.com/')
     time.sleep(5)
-    driver.find_element(By.NAME, 'username').send_keys(sbaccount)
-    driver.find_element(By.NAME, 'password').send_keys(sbpd)
+    sbaccount = account[4]
+    sbpd = password[0]
+    driver.find_element(By.NAME,'username').send_keys(sbaccount)
+    driver.find_element(By.NAME,'password').send_keys(sbpd)
     time.sleep(3)
-    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
+    driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
     time.sleep(3)
-    driver.find_element(By.XPATH, '//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
+    driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
     time.sleep(3)
     driver.get(url)
     time.sleep(3)
     limit_of_posts = 10
     limit_of_scroll = 100
     c = 0
-    c_sroll = 0
+    c_sroll=0
     link = []
-    
+    last_height = driver.execute_script("return document.body.scrollHeight")
     while True:
         print(c, c_sroll)
         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
@@ -67,78 +63,92 @@ def download_post():
             print(elem['href'])
             if elem['href'] not in link:
                 link.append(elem['href'])
-        c += 1
-        c_sroll += 1
+        c+=1
+        c_sroll+=1
+        new_height = driver.execute_script("return document.body.scrollHeight")
         if c > limit_of_posts:
             print('中場休息')
-            time.sleep(randint(15, 30))
-            c = 0
-        if c_sroll > limit_of_scroll:
+            time.sleep(randint(15,30))
+            c=0
+        if new_height == last_height:
+            break
+        if c_sroll>limit_of_scroll:
             break
-        print(len(link))
     driver.close()
-    print(len(link))
-    with open('link.txt', 'w') as f:
+    print('post總數',len(link))
+    with open('link.txt','w') as f:
         f.write(str(link))
     return link, hashtag
 
 
 def hashtag():
-    input_hashtag = 'pet_cake'
+    input_hashtag = 'online_platform'
     collect_hashtag = []
-    c = 0
-    f = open('link.txt', 'r')
+    c=0
+    rest = 0
+    f = open('link.txt','r')
     result = f.read()
     link = ast.literal_eval(result)
     s = Service(path)
-    driver = webdriver.Chrome(service=s, options=options)
+    driver = webdriver.Chrome(service=s,options=options)
     driver.implicitly_wait(3)
     driver.get('https://www.instagram.com/')
     time.sleep(5)
-    driver.find_element(By.NAME, 'username').send_keys(sbaccount)
-    driver.find_element(By.NAME, 'password').send_keys(sbpd)
+    sbaccount = account[1]
+    sbpd = password[0]
+    driver.find_element(By.NAME,'username').send_keys(sbaccount)
+    driver.find_element(By.NAME,'password').send_keys(sbpd)
     time.sleep(3)
-    
-    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
+
+    driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
     driver.implicitly_wait(3)
-    
-    driver.find_element(By.XPATH, '//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
+
+    driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
     time.sleep(3)
-    
-    for url in link:
+
+    for i,url in enumerate(link):
+        print(i)
         driver.get(f"https://www.instagram.com/{url}")
         soup = BeautifulSoup(driver.page_source, 'html.parser')
         hashtags = soup.find_all('a', class_='xil3i')
         for tag in hashtags:
-            print(tag.text)
             collect_hashtag.append(tag.text)
-        c += 1
-        if c > 10:
-            time.sleep(randint(5, 10))
-            c = 0
+            print(tag.text)
+        c+=1
+        rest += 1
+        if c>10:
+            time.sleep(randint(5,10))
+            c=0
+        if rest>100:
+            time.sleep(randint(60,90))
+            rest=0
     driver.close()
     print(collect_hashtag)
+    print('開始寫入db')
     db_company = pymysql.connect(host='db.ptt.cx',
                                  user='choozmo',
                                  password='pAssw0rd',
                                  database='seo')
     cur = db_company.cursor()
-    query_new_col = f"ALTER TABLE seo.ig_pet ADD COLUMN {input_hashtag} VARCHAR(45) NULL"
+    query_new_col = f"ALTER TABLE seo.ig_pet_2 ADD COLUMN {input_hashtag} VARCHAR(45) NULL"
     cur.execute(query_new_col)
     id_number = 0
-    cur.execute("select * from seo.ig_pet order by `index` desc limit 1")
+    cur.execute("select * from seo.ig_pet_2 order by `id` desc limit 1")
     last_id = cur.fetchall()[0][0]
-    insert_row = len(collect_hashtag) - last_id
+    insert_row = len(collect_hashtag)-last_id
     for i in range(0, insert_row):
-        query_insert = f"INSERT INTO seo.ig_pet ({input_hashtag}) VALUES ('')"
+        query_insert = f"INSERT INTO seo.ig_pet_2 ({input_hashtag}) VALUES ('')"
         cur.execute(query_insert)
+        db_company.commit()
+        print(i)
     for i in collect_hashtag:
         i = i.replace('#', '')
-        query_update = f"UPDATE seo.ig_pet SET {input_hashtag}='{i}' where `index`='{id_number}'"
+        print(i,id_number)
+        query_update = f"UPDATE seo.ig_pet_2 SET {input_hashtag}='{i}' where `id`='{id_number}'"
         cur.execute(query_update)
+        db_company.commit()
         id_number += 1
-    
-    db_company.commit()
+    print('post總數',len(link))
     db_company.close()
     return hashtag