Bläddra i källkod

Merge branch 'master' of http://git.choozmo.com:3000/choozmo/kw_tools

Jared 3 år sedan
förälder
incheckning
b63b239310
5 ändrade filer med 138 tillägg och 58 borttagningar
  1. 27 0
      2000ece3-455d-4cb5-b20e-ad798cf8bf23.json
  2. 96 58
      ig_tags/ig_selenium.py
  3. 2 0
      ig_tags/pd.py
  4. 13 0
      ig_tags/setting.py
  5. 0 0
      ig_tags/tags.py

+ 27 - 0
2000ece3-455d-4cb5-b20e-ad798cf8bf23.json

@@ -0,0 +1,27 @@
+
+    {
+       "access_token":"ya29.A0ARrdaM-8J7urxGZFJBi4jt4ORkrCofDLMI3SN_jNfFjt5HlRZnlQ_FCRqQNZupdR0HWhAgzOE92p-AjAaBpKwklGQGxM5m3byAjQsr8qHr237p1lsdWif0kffHt2wpNxowuy5UyrzxbrSsj0zmuzZ5JgthRd",
+       "client_id":"184319941539-gdh6p4v400g0f5fj076bp7l3cf7vn7ha.apps.googleusercontent.com",
+       "client_secret":"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",
+       "refresh_token":"1//0e3GKAc0Xl1V1CgYIARAAGA4SNwF-L9IrMmaPcxVGEA6J-yYeVzx8l9C3op0oiga7Ouw-_b7hv3enOhPwEixoH7pY3efL_aY6cSw",
+       "token_expiry":"2020-10-27T18:03:48Z",
+       "token_uri":"https://accounts.google.com/o/oauth2/token",
+       "user_agent":null,
+       "revoke_uri":"https://oauth2.googleapis.com/revoke",
+       "id_token":null,
+       "id_token_jwt":null,
+       "token_response":{
+          "access_token":"1//0e3GKAc0Xl1V1CgYIARAAGA4SNwF-L9IrMmaPcxVGEA6J-yYeVzx8l9C3op0oiga7Ouw-_b7hv3enOhPwEixoH7pY3efL_aY6cSw",
+          "expires_in": 3599,
+          "scope":"https://www.googleapis.com/auth/youtube.upload",
+          "token_type":"Bearer"
+       },
+       "scopes":[
+          "https://www.googleapis.com/auth/youtube.upload"
+       ],
+       "token_info_uri":"https://oauth2.googleapis.com/tokeninfo",
+       "invalid":false,
+       "_class":"OAuth2Credentials",
+       "_module":"oauth2client.client"
+    }
+            

+ 96 - 58
ig_tags/ig_selenium.py

@@ -11,97 +11,135 @@ from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions as ec
-import dataset
+# import dataset
+import ast
 import pymysql
+
 pymysql.install_as_MySQLdb()
 
+import pd
+import setting
+
 
-account = ['chenlinrain']
-pd = ['weareyoung12']
+account = ['chenlinrain','enjoylisteningswift','novak_goodenough']
+
+account = pd.account
+password = pd.password
 
 path = '/Users/zooeytsai/Downloads/chromedriver'
 sbaccount = account[2]
-sbpd = pd[1]
-options = Options()
-options.add_argument("--headless")
+sbpd = password[1]
+options = webdriver.ChromeOptions()
+# options.add_argument("--headless") # 視窗背後執行
+options.add_argument("user-agent=%s" % rua())
+
 
 def download_post():
-    hashtag = '寵物'
+    hashtag = '寵物蛋糕'
     url = f"https://www.instagram.com/explore/tags/{hashtag}/"
     s = Service(path)
-    driver = webdriver.Chrome(service=s)
+    driver = webdriver.Chrome(service=s, options=options)
     driver.implicitly_wait(3)
     driver.get('https://www.instagram.com/')
     time.sleep(5)
-    driver.find_element(By.NAME,'username').send_keys(sbaccount)
-    driver.find_element(By.NAME,'password').send_keys(sbpd)
+    driver.find_element(By.NAME, 'username').send_keys(sbaccount)
+    driver.find_element(By.NAME, 'password').send_keys(sbpd)
     time.sleep(3)
-    driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
-    driver.implicitly_wait(3)
-    driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
+    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
+    time.sleep(3)
+    driver.find_element(By.XPATH, '//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
     time.sleep(3)
-    cookie = driver.get_cookies()
-    jar = RequestsCookieJar()
-    for i in cookie:
-        jar.set(i['name'], i['value'])
     driver.get(url)
     time.sleep(3)
-    number_of_posts = 100
-    wait = WebDriverWait(driver, 10)
-    last_height = driver.execute_script(
-        "window.scrollTo(0, document.body.scrollHeight);var scrolldown=document.body.scrollHeight;return scrolldown;")
+    limit_of_posts = 10
+    limit_of_scroll = 100
+    c = 0
+    c_sroll = 0
     link = []
+    
     while True:
-        href_element = wait.until(ec.visibility_of_all_elements_located((By.XPATH, "//a[@href]")))
-        for i in href_element:
-            link.append(i.get_attribute('href'))
-        print(link)
+        print(c, c_sroll)
         driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
         time.sleep(3)
-        new_height = driver.execute_script("return document.body.scrollHeight")
-        driver.implicitly_wait(3)
-        number_of_posts+=1
-        if len(link)>number_of_posts:
-            time.sleep(randint(180,300))
-            number_of_posts == 0
-        if len(link)>10000:
+        html = driver.page_source
+        soup = BeautifulSoup(html, 'lxml')
+        for elem in soup.select('article div div div div a'):
+            print(elem['href'])
+            if elem['href'] not in link:
+                link.append(elem['href'])
+        c += 1
+        c_sroll += 1
+        if c > limit_of_posts:
+            print('中場休息')
+            time.sleep(randint(15, 30))
+            c = 0
+        if c_sroll > limit_of_scroll:
             break
-
+        print(len(link))
     driver.close()
     print(len(link))
-    return link
-
+    with open('link.txt', 'w') as f:
+        f.write(str(link))
+    return link, hashtag
 
 
 def hashtag():
-    hashtag = []
-    c=0
-    for url in download_post():
-        s = Service(path)
-        driver = webdriver.Chrome(service=s)
-        driver.implicitly_wait(3)
-        driver.get(url)
+    input_hashtag = 'pet_cake'
+    collect_hashtag = []
+    c = 0
+    f = open('link.txt', 'r')
+    result = f.read()
+    link = ast.literal_eval(result)
+    s = Service(path)
+    driver = webdriver.Chrome(service=s, options=options)
+    driver.implicitly_wait(3)
+    driver.get('https://www.instagram.com/')
+    time.sleep(5)
+    driver.find_element(By.NAME, 'username').send_keys(sbaccount)
+    driver.find_element(By.NAME, 'password').send_keys(sbpd)
+    time.sleep(3)
+    
+    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
+    driver.implicitly_wait(3)
+    
+    driver.find_element(By.XPATH, '//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
+    time.sleep(3)
+    
+    for url in link:
+        driver.get(f"https://www.instagram.com/{url}")
         soup = BeautifulSoup(driver.page_source, 'html.parser')
         hashtags = soup.find_all('a', class_='xil3i')
         for tag in hashtags:
             print(tag.text)
-            hashtag.append(tag)
-        c+=1
-        if c>10:
-            time.sleep(3)
-            c=0
+            collect_hashtag.append(tag.text)
+        c += 1
+        if c > 10:
+            time.sleep(randint(5, 10))
+            c = 0
     driver.close()
-
-    db = pymysql.connect(host='localhost',
-                         user='root',
-                         password='jondae350',
-                         database='ig_tags')
-    cur = db.cursor()
-    query = "ALTER TABLE ig_tags ADD taiwan_food TEXT(100)"
-    for i in hashtag:
-        cur.execute('INSERT INTO ig_tags (taiwan_food) VALUES (%s)',i)
-    db.commit()
-    db.close()
+    print(collect_hashtag)
+    db_company = pymysql.connect(host='db.ptt.cx',
+                                 user='choozmo',
+                                 password='pAssw0rd',
+                                 database='seo')
+    cur = db_company.cursor()
+    query_new_col = f"ALTER TABLE seo.ig_pet ADD COLUMN {input_hashtag} VARCHAR(45) NULL"
+    cur.execute(query_new_col)
+    id_number = 0
+    cur.execute("select * from seo.ig_pet order by `index` desc limit 1")
+    last_id = cur.fetchall()[0][0]
+    insert_row = len(collect_hashtag) - last_id
+    for i in range(0, insert_row):
+        query_insert = f"INSERT INTO seo.ig_pet ({input_hashtag}) VALUES ('')"
+        cur.execute(query_insert)
+    for i in collect_hashtag:
+        i = i.replace('#', '')
+        query_update = f"UPDATE seo.ig_pet SET {input_hashtag}='{i}' where `index`='{id_number}'"
+        cur.execute(query_update)
+        id_number += 1
+    
+    db_company.commit()
+    db_company.close()
     return hashtag
 
 

+ 2 - 0
ig_tags/pd.py

@@ -0,0 +1,2 @@
+account = ['chenlinrain','enjoylisteningswift','novak_goodenough']
+password = ['weareyoung12']

+ 13 - 0
ig_tags/setting.py

@@ -0,0 +1,13 @@
+import random
+
+def rua():
+    """Return a random User-Agent string drawn from a fixed pool.
+
+    Returns:
+        str: one of the browser User-Agent strings listed in ``pool``,
+        selected with ``random.choice``.
+    """
+    pool = [
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
+        # Every entry must end with a comma: adjacent string literals are
+        # concatenated implicitly, which would fuse two agents into one value.
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 OPR/68.0.3618.125",
+    ]
+    return random.choice(pool)

+ 0 - 0
ig_tags/ig_tags.py → ig_tags/tags.py