|
@@ -12,7 +12,9 @@ from selenium.webdriver.common.by import By
|
|
|
from selenium.webdriver.support.wait import WebDriverWait
|
|
|
from selenium.webdriver.support import expected_conditions as ec
|
|
|
# import dataset
|
|
|
+import ast
|
|
|
import pymysql
|
|
|
+
|
|
|
pymysql.install_as_MySQLdb()
|
|
|
|
|
|
import pd
|
|
@@ -21,98 +23,99 @@ import setting
|
|
|
|
|
|
# --- Runtime configuration (module-level side effects) ---

# Hard-coded fallback accounts; immediately superseded by the credential
# module `pd` below, so the literal is dead — kept only for reference.
account = ['chenlinrain', 'enjoylisteningswift', 'novak_goodenough']
account = pd.account
password = pd.password

# Path to the local chromedriver binary.
path = '/Users/zooeytsai/Downloads/chromedriver'

# Credentials used by the scraper functions below.
# NOTE(review): account index 2 is paired with password index 1 — looks
# inconsistent; confirm pd.account / pd.password are aligned by index.
sbaccount = account[2]
sbpd = password[1]

options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # run the browser without a window
# Randomised user agent so the crawl looks less like a bot.
# NOTE(review): this was `setting.rua()` before the change; confirm `rua`
# is actually imported at the top of the file, otherwise this raises
# NameError at import time.
options.add_argument("user-agent=%s" % rua())
|
|
|
|
|
|
|
|
|
def download_post(hashtag='寵物蛋糕'):
    """Scrape post links from an Instagram hashtag explore page.

    Logs in with the module-level credentials, opens the hashtag page,
    scrolls repeatedly while collecting unique post hrefs, persists them
    to ``link.txt`` and returns them.

    Parameters
    ----------
    hashtag : str
        Hashtag to crawl (without ``#``). Defaults to the original
        hard-coded value so existing callers are unaffected.

    Returns
    -------
    tuple[list[str], str]
        The collected post links (insertion order, no duplicates) and
        the hashtag that was crawled.
    """
    url = f"https://www.instagram.com/explore/tags/{hashtag}/"
    s = Service(path)
    driver = webdriver.Chrome(service=s, options=options)
    driver.implicitly_wait(3)

    # --- log in ---
    driver.get('https://www.instagram.com/')
    time.sleep(5)
    driver.find_element(By.NAME, 'username').send_keys(sbaccount)
    driver.find_element(By.NAME, 'password').send_keys(sbpd)
    time.sleep(3)
    driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click()  # 登入
    time.sleep(3)
    # Dismiss the "save your login info?" interstitial.
    driver.find_element(By.XPATH, '//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
    time.sleep(3)

    driver.get(url)
    time.sleep(3)

    limit_of_posts = 10    # scrolls between each long rest pause
    limit_of_scroll = 100  # total scrolls before stopping
    c = 0
    c_sroll = 0
    link = []
    seen = set()  # O(1) membership test instead of scanning `link` each time

    while True:
        print(c, c_sroll)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        # Parse the freshly-loaded page and harvest post anchors.
        soup = BeautifulSoup(driver.page_source, 'lxml')
        for elem in soup.select('article div div div div a'):
            print(elem['href'])
            if elem['href'] not in seen:
                seen.add(elem['href'])
                link.append(elem['href'])
        c += 1
        c_sroll += 1
        if c > limit_of_posts:
            print('中場休息')
            time.sleep(randint(15, 30))  # rest to avoid rate limiting
            c = 0
        if c_sroll > limit_of_scroll:
            break
        print(len(link))

    driver.close()
    print(len(link))
    # Persist so hashtag() can run later without re-crawling.
    with open('link.txt', 'w', encoding='utf-8') as f:
        f.write(str(link))
    return link, hashtag
|
|
|
|
|
|
|
|
|
def hashtag():
|
|
|
input_hashtag = 'pet_cake'
|
|
|
collect_hashtag = []
|
|
|
- c=0
|
|
|
- result = download_post()[0]
|
|
|
+ c = 0
|
|
|
+ f = open('link.txt', 'r')
|
|
|
+ result = f.read()
|
|
|
+ link = ast.literal_eval(result)
|
|
|
s = Service(path)
|
|
|
- driver = webdriver.Chrome(service=s,options=options)
|
|
|
+ driver = webdriver.Chrome(service=s, options=options)
|
|
|
driver.implicitly_wait(3)
|
|
|
driver.get('https://www.instagram.com/')
|
|
|
time.sleep(5)
|
|
|
- driver.find_element(By.NAME,'username').send_keys(sbaccount)
|
|
|
- driver.find_element(By.NAME,'password').send_keys(sbpd)
|
|
|
+ driver.find_element(By.NAME, 'username').send_keys(sbaccount)
|
|
|
+ driver.find_element(By.NAME, 'password').send_keys(sbpd)
|
|
|
time.sleep(3)
|
|
|
- driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click() # 登入
|
|
|
+
|
|
|
+ driver.find_element(By.XPATH, '//*[@id="loginForm"]/div/div[3]/button').click() # 登入
|
|
|
driver.implicitly_wait(3)
|
|
|
- driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
|
|
|
+
|
|
|
+ driver.find_element(By.XPATH, '//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
|
|
|
time.sleep(3)
|
|
|
- for url in result:
|
|
|
+
|
|
|
+ for url in link:
|
|
|
driver.get(f"https://www.instagram.com/{url}")
|
|
|
soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
|
hashtags = soup.find_all('a', class_='xil3i')
|
|
|
for tag in hashtags:
|
|
|
print(tag.text)
|
|
|
collect_hashtag.append(tag.text)
|
|
|
- c+=1
|
|
|
- if c>10:
|
|
|
- time.sleep(randint(5,10))
|
|
|
- c=0
|
|
|
+ c += 1
|
|
|
+ if c > 10:
|
|
|
+ time.sleep(randint(5, 10))
|
|
|
+ c = 0
|
|
|
driver.close()
|
|
|
print(collect_hashtag)
|
|
|
db_company = pymysql.connect(host='db.ptt.cx',
|
|
@@ -125,7 +128,7 @@ def hashtag():
|
|
|
id_number = 0
|
|
|
cur.execute("select * from seo.ig_pet order by `index` desc limit 1")
|
|
|
last_id = cur.fetchall()[0][0]
|
|
|
- insert_row = len(collect_hashtag)-last_id
|
|
|
+ insert_row = len(collect_hashtag) - last_id
|
|
|
for i in range(0, insert_row):
|
|
|
query_insert = f"INSERT INTO seo.ig_pet ({input_hashtag}) VALUES ('')"
|
|
|
cur.execute(query_insert)
|
|
@@ -134,11 +137,11 @@ def hashtag():
|
|
|
query_update = f"UPDATE seo.ig_pet SET {input_hashtag}='{i}' where `index`='{id_number}'"
|
|
|
cur.execute(query_update)
|
|
|
id_number += 1
|
|
|
-
|
|
|
+
|
|
|
db_company.commit()
|
|
|
db_company.close()
|
|
|
return hashtag
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Pipeline: download_post() writes link.txt, which hashtag() then
    # reads back (via ast.literal_eval) to collect tags per post.
    # Guarded so importing this module does not launch a crawl.
    download_post()
    hashtag()
|