zooeytsai 3 år sedan
förälder
incheckning
030dbf582f
1 ändrade filer med 64 tillägg och 63 borttagningar
  1. 64 63
      ig_tags/ig_selenium.py

+ 64 - 63
ig_tags/ig_selenium.py

@@ -12,131 +12,132 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions as ec
 # import dataset
+import ast
 import pymysql
+
 pymysql.install_as_MySQLdb()
 import pd
-import setting
-
-account = ['chenlinrain','enjoylisteningswift','novak_goodenough']
+from setting import rua
 
-password = ['weareyoung12']
+account = pd.account
+password = pd.password
 
-path = "C:\\Users\\user\\Downloads\\chromedriver_win32"
-sbaccount = account[1]
+path = '/Users/zooeytsai/Downloads/chromedriver'
+sbaccount = account[2]
 sbpd = password[1]
 options = webdriver.ChromeOptions()
 # options.add_argument("--headless") # 視窗背後執行
-options.add_argument("user-agent=%s" % setting.rua())
+options.add_argument("user-agent=%s" % rua())
 
 
 def download_post():
-    hashtag = '寵物鮮食'
+    hashtag = '寵物蛋糕'
     url = f"https://www.instagram.com/explore/tags/{hashtag}/"
     s = Service(path)
     driver = webdriver.Chrome(service=s, options=options)
     driver.implicitly_wait(3)
     driver.get('https://www.instagram.com/')
     time.sleep(5)
-    driver.find_element(By.NAME,'username').send_keys(sbaccount)
-    driver.find_element(By.NAME,'password').send_keys(sbpd)
+    driver.find_element(By.NAME, 'username').send_keys(sbaccount)
+    driver.find_element(By.NAME, 'password').send_keys(sbpd)
     time.sleep(3)
-    driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
-    driver.implicitly_wait(3)
-    driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
+    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
+    time.sleep(3)
+    driver.find_element(By.XPATH, '//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
     time.sleep(3)
     driver.get(url)
     time.sleep(3)
-    limit_of_posts = 1
-    limit_of_scroll = 3
+    limit_of_posts = 10
+    limit_of_scroll = 100
     c = 0
-    c_sroll=0
-    wait = WebDriverWait(driver, 10)
-    last_height = driver.execute_script(
-        "window.scrollTo(0, document.body.scrollHeight);var scrolldown=document.body.scrollHeight;return scrolldown;")
+    c_sroll = 0
     link = []
+    
     while True:
         print(c, c_sroll)
-
-        # href_element = wait.until(ec.visibility_of_all_elements_located((By.XPATH, "//a[@href]")))
-        # for i in href_element:
-        #     link.append(i.get_attribute('href'))
-        # print(link)
         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
         time.sleep(3)
-        # new_height = driver.execute_script("return document.body.scrollHeight")
-        # driver.implicitly_wait(3)
-        c+=1
-        c_sroll+=1
+        html = driver.page_source
+        soup = BeautifulSoup(html, 'lxml')
+        for elem in soup.select('article div div div div a'):
+            print(elem['href'])
+            if elem['href'] not in link:
+                link.append(elem['href'])
+        c += 1
+        c_sroll += 1
         if c > limit_of_posts:
             print('中場休息')
-            time.sleep(randint(15,30))
-            c=0
-        if c_sroll>limit_of_scroll:
+            time.sleep(randint(15, 30))
+            c = 0
+        if c_sroll > limit_of_scroll:
             break
-    html = driver.page_source
-    soup = BeautifulSoup(html, 'lxml')
-    for elem in soup.select('article div div div div a'):
-        if elem['href'] not in link:
-            link.append(elem['href'])
+        print(len(link))
     driver.close()
     print(len(link))
+    with open('link.txt', 'w') as f:
+        f.write(str(link))
     return link, hashtag
 
 
 def hashtag():
-    input_hashtag = 'pet_food'
+    input_hashtag = 'pet_cake'
     collect_hashtag = []
-    c=0
-    result = download_post()[0]
+    c = 0
+    f = open('link.txt', 'r')
+    result = f.read()
+    link = ast.literal_eval(result)
     s = Service(path)
-    driver = webdriver.Chrome(service=s,options=options)
+    driver = webdriver.Chrome(service=s, options=options)
     driver.implicitly_wait(3)
     driver.get('https://www.instagram.com/')
     time.sleep(5)
-    driver.find_element(By.NAME,'username').send_keys(sbaccount)
-    driver.find_element(By.NAME,'password').send_keys(sbpd)
+    driver.find_element(By.NAME, 'username').send_keys(sbaccount)
+    driver.find_element(By.NAME, 'password').send_keys(sbpd)
     time.sleep(3)
-    driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
+    
+    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
     driver.implicitly_wait(3)
-    driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
+    
+    driver.find_element(By.XPATH, '//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
     time.sleep(3)
-    for url in result:
+    
+    for url in link:
         driver.get(f"https://www.instagram.com/{url}")
         soup = BeautifulSoup(driver.page_source, 'html.parser')
         hashtags = soup.find_all('a', class_='xil3i')
         for tag in hashtags:
             print(tag.text)
             collect_hashtag.append(tag.text)
-        c+=1
-        if c>10:
-            time.sleep(randint(5,10))
-            c=0
+        c += 1
+        if c > 10:
+            time.sleep(randint(5, 10))
+            c = 0
     driver.close()
     print(collect_hashtag)
-    db = pymysql.connect(host='db.ptt.cx',
-                         user='choozmo',
-                         password='pAssw0rd',
-                         database='seo')
-    cur = db.cursor()
+    db_company = pymysql.connect(host='db.ptt.cx',
+                                 user='choozmo',
+                                 password='pAssw0rd',
+                                 database='seo')
+    cur = db_company.cursor()
     query_new_col = f"ALTER TABLE seo.ig_pet ADD COLUMN {input_hashtag} VARCHAR(45) NULL"
     cur.execute(query_new_col)
-    id_number = 1
-    cur.execute("select * from ig_tags.new_table order by id desc limit 1")
+    id_number = 0
+    cur.execute("select * from seo.ig_pet order by `index` desc limit 1")
     last_id = cur.fetchall()[0][0]
-    insert_row = len(collect_hashtag)-last_id
+    insert_row = len(collect_hashtag) - last_id
     for i in range(0, insert_row):
-        query_insert = f"INSERT INTO ig_tags.new_table ({input_hashtag}) VALUES ('')"
+        query_insert = f"INSERT INTO seo.ig_pet ({input_hashtag}) VALUES ('')"
         cur.execute(query_insert)
     for i in collect_hashtag:
         i = i.replace('#', '')
-        query_update = f"UPDATE ig_tags.new_table SET {input_hashtag}='{i}' where id='{id_number}'"
+        query_update = f"UPDATE seo.ig_pet SET {input_hashtag}='{i}' where `index`='{id_number}'"
         cur.execute(query_update)
         id_number += 1
-
-    db.commit()
-    db.close()
+    
+    db_company.commit()
+    db_company.close()
     return hashtag
 
 
-
+download_post()
 hashtag()