Ver código fonte

Merge branch 'master' of http://git.choozmo.com:3000/choozmo/kw_tools

Jared 3 anos atrás
pai
commit
4a64f54488
1 arquivo alterado com 71 adições e 46 exclusões
  1. 71 46
      ig_tags/ig_selenium.py

+ 71 - 46
ig_tags/ig_selenium.py

@@ -12,24 +12,28 @@ from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions as ec
 from selenium.webdriver.support import expected_conditions as ec
-# import dataset
+import pandas
+import os
 import ast
 import ast
 import pymysql
 import pymysql
-pymysql.install_as_MySQLdb()
 
 
+pymysql.install_as_MySQLdb()
+import pd
+from setting import rua
 
 
-account = ['chenlinrain','aruikuwasaki','enjoylisteningswift','novak_goodenough']
+account = ['chenlinrain', 'aruikuwasaki', 'enjoylisteningswift', 'novak_goodenough']
 password = ['weareyoung12']
 password = ['weareyoung12']
 
 
 path = 'C:\\Users\\user\\Downloads\\chromedriver_99\\chromedriver'
 path = 'C:\\Users\\user\\Downloads\\chromedriver_99\\chromedriver'
 
 
 options = webdriver.ChromeOptions()
 options = webdriver.ChromeOptions()
-options.add_argument("--headless") # 視窗背後執行
+options.add_argument("--headless")  # 視窗背後執行
 options.add_argument("user-agent=%s" % rua())
 options.add_argument("user-agent=%s" % rua())
 
 
+hashtag = '上課平台'
+
 
 
 def download_post():
 def download_post():
-    hashtag = '上課平台'
     url = f"https://www.instagram.com/explore/tags/{hashtag}/"
     url = f"https://www.instagram.com/explore/tags/{hashtag}/"
     s = Service(path)
     s = Service(path)
     driver = webdriver.Chrome(service=s, options=options)
     driver = webdriver.Chrome(service=s, options=options)
@@ -38,19 +42,19 @@ def download_post():
     time.sleep(5)
     time.sleep(5)
     sbaccount = account[4]
     sbaccount = account[4]
     sbpd = password[0]
     sbpd = password[0]
-    driver.find_element(By.NAME,'username').send_keys(sbaccount)
-    driver.find_element(By.NAME,'password').send_keys(sbpd)
+    driver.find_element(By.NAME, 'username').send_keys(sbaccount)
+    driver.find_element(By.NAME, 'password').send_keys(sbpd)
     time.sleep(3)
     time.sleep(3)
-    driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
+    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
     time.sleep(3)
     time.sleep(3)
-    driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
+    driver.find_element(By.XPATH, '//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
     time.sleep(3)
     time.sleep(3)
     driver.get(url)
     driver.get(url)
     time.sleep(3)
     time.sleep(3)
     limit_of_posts = 10
     limit_of_posts = 10
     limit_of_scroll = 100
     limit_of_scroll = 100
     c = 0
     c = 0
-    c_sroll=0
+    c_sroll = 0
     link = []
     link = []
     last_height = driver.execute_script("return document.body.scrollHeight")
     last_height = driver.execute_script("return document.body.scrollHeight")
     while True:
     while True:
@@ -63,50 +67,59 @@ def download_post():
             print(elem['href'])
             print(elem['href'])
             if elem['href'] not in link:
             if elem['href'] not in link:
                 link.append(elem['href'])
                 link.append(elem['href'])
-        c+=1
-        c_sroll+=1
+        c += 1
+        c_sroll += 1
         new_height = driver.execute_script("return document.body.scrollHeight")
         new_height = driver.execute_script("return document.body.scrollHeight")
-        if c > limit_of_posts:
-            print('中場休息')
-            time.sleep(randint(15,30))
-            c=0
         if new_height == last_height:
         if new_height == last_height:
+            print('沒有更多文章了')
             break
             break
-        if c_sroll>limit_of_scroll:
+        if c_sroll > limit_of_scroll:
             break
             break
+        if c > limit_of_posts:
+            print('中場休息')
+            time.sleep(randint(15, 30))
+            c = 0
+    
     driver.close()
     driver.close()
-    print('post總數',len(link))
-    with open('link.txt','w') as f:
+    print('post總數', len(link))
+    with open('link.txt', 'w') as f:
         f.write(str(link))
         f.write(str(link))
     return link, hashtag
     return link, hashtag
 
 
 
 
+db_company = pymysql.connect(host='db.ptt.cx',
+                             user='choozmo',
+                             password='pAssw0rd',
+                             database='seo')
+
+input_hashtag = ''
+
+
 def hashtag():
 def hashtag():
-    input_hashtag = 'online_platform'
     collect_hashtag = []
     collect_hashtag = []
-    c=0
+    c = 0
     rest = 0
     rest = 0
-    f = open('link.txt','r')
+    f = open('link.txt', 'r')
     result = f.read()
     result = f.read()
     link = ast.literal_eval(result)
     link = ast.literal_eval(result)
     s = Service(path)
     s = Service(path)
-    driver = webdriver.Chrome(service=s,options=options)
+    driver = webdriver.Chrome(service=s, options=options)
     driver.implicitly_wait(3)
     driver.implicitly_wait(3)
     driver.get('https://www.instagram.com/')
     driver.get('https://www.instagram.com/')
     time.sleep(5)
     time.sleep(5)
     sbaccount = account[1]
     sbaccount = account[1]
     sbpd = password[0]
     sbpd = password[0]
-    driver.find_element(By.NAME,'username').send_keys(sbaccount)
-    driver.find_element(By.NAME,'password').send_keys(sbpd)
+    driver.find_element(By.NAME, 'username').send_keys(sbaccount)
+    driver.find_element(By.NAME, 'password').send_keys(sbpd)
     time.sleep(3)
     time.sleep(3)
-
-    driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
+    
+    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
     driver.implicitly_wait(3)
     driver.implicitly_wait(3)
-
-    driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
+    
+    driver.find_element(By.XPATH, '//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
     time.sleep(3)
     time.sleep(3)
-
-    for i,url in enumerate(link):
+    
+    for i, url in enumerate(link):
         print(i)
         print(i)
         driver.get(f"https://www.instagram.com/{url}")
         driver.get(f"https://www.instagram.com/{url}")
         soup = BeautifulSoup(driver.page_source, 'html.parser')
         soup = BeautifulSoup(driver.page_source, 'html.parser')
@@ -114,28 +127,25 @@ def hashtag():
         for tag in hashtags:
         for tag in hashtags:
             collect_hashtag.append(tag.text)
             collect_hashtag.append(tag.text)
             print(tag.text)
             print(tag.text)
-        c+=1
+        c += 1
         rest += 1
         rest += 1
-        if c>10:
-            time.sleep(randint(5,10))
-            c=0
-        if rest>100:
-            time.sleep(randint(60,90))
-            rest=0
+        if c > 10:
+            time.sleep(randint(5, 10))
+            c = 0
+        if rest > 100:
+            time.sleep(randint(60, 90))
+            rest = 0
     driver.close()
     driver.close()
     print(collect_hashtag)
     print(collect_hashtag)
     print('開始寫入db')
     print('開始寫入db')
-    db_company = pymysql.connect(host='db.ptt.cx',
-                                 user='choozmo',
-                                 password='pAssw0rd',
-                                 database='seo')
+    
     cur = db_company.cursor()
     cur = db_company.cursor()
     query_new_col = f"ALTER TABLE seo.ig_pet_2 ADD COLUMN {input_hashtag} VARCHAR(45) NULL"
     query_new_col = f"ALTER TABLE seo.ig_pet_2 ADD COLUMN {input_hashtag} VARCHAR(45) NULL"
     cur.execute(query_new_col)
     cur.execute(query_new_col)
     id_number = 0
     id_number = 0
     cur.execute("select * from seo.ig_pet_2 order by `id` desc limit 1")
     cur.execute("select * from seo.ig_pet_2 order by `id` desc limit 1")
     last_id = cur.fetchall()[0][0]
     last_id = cur.fetchall()[0][0]
-    insert_row = len(collect_hashtag)-last_id
+    insert_row = len(collect_hashtag) - last_id
     for i in range(0, insert_row):
     for i in range(0, insert_row):
         query_insert = f"INSERT INTO seo.ig_pet_2 ({input_hashtag}) VALUES ('')"
         query_insert = f"INSERT INTO seo.ig_pet_2 ({input_hashtag}) VALUES ('')"
         cur.execute(query_insert)
         cur.execute(query_insert)
@@ -143,15 +153,30 @@ def hashtag():
         print(i)
         print(i)
     for i in collect_hashtag:
     for i in collect_hashtag:
         i = i.replace('#', '')
         i = i.replace('#', '')
-        print(i,id_number)
+        print(i, id_number)
         query_update = f"UPDATE seo.ig_pet_2 SET {input_hashtag}='{i}' where `id`='{id_number}'"
         query_update = f"UPDATE seo.ig_pet_2 SET {input_hashtag}='{i}' where `id`='{id_number}'"
         cur.execute(query_update)
         cur.execute(query_update)
         db_company.commit()
         db_company.commit()
         id_number += 1
         id_number += 1
-    print('post總數',len(link))
+    print('post總數', len(link))
     db_company.close()
     db_company.close()
     return hashtag
     return hashtag
 
 
 
 
+def create_csv():
+    save_file_path = os.getcwd()
+    
+    query = f"SELECT `{input_hashtag}`, COUNT(*) FROM seo.ig_pet_2 WHERE `{input_hashtag}` IS NOT Null GROUP BY `{input_hashtag}`"
+    cur = db_company.cursor()
+    cur.execute(query)
+    data = cur.fetchall()
+    tuple_to_list = [list(x) for x in data]
+    df = pandas.DataFrame(tuple_to_list)
+    df.columns = [input_hashtag, '計數']
+    df = df.sort_values(by=['計數'], ascending=False)
+    df.to_csv(f'{save_file_path}/hashtag_file/{hashtag}.csv', index=False)
+
+
 download_post()
 download_post()
-hashtag()
+hashtag()
+create_csv()