|
@@ -25,6 +25,7 @@ sbpd = pd[1]
|
|
options = Options()
|
|
options = Options()
|
|
options.add_argument("--headless")
|
|
options.add_argument("--headless")
|
|
|
|
|
|
|
|
+
|
|
def download_post():
|
|
def download_post():
|
|
hashtag = '寵物零食'
|
|
hashtag = '寵物零食'
|
|
url = f"https://www.instagram.com/explore/tags/{hashtag}/"
|
|
url = f"https://www.instagram.com/explore/tags/{hashtag}/"
|
|
@@ -47,29 +48,39 @@ def download_post():
|
|
driver.get(url)
|
|
driver.get(url)
|
|
time.sleep(3)
|
|
time.sleep(3)
|
|
limit_of_posts = 10
|
|
limit_of_posts = 10
|
|
|
|
+ limit_of_scroll = 250
|
|
c = 0
|
|
c = 0
|
|
|
|
+ c_sroll=0
|
|
wait = WebDriverWait(driver, 10)
|
|
wait = WebDriverWait(driver, 10)
|
|
last_height = driver.execute_script(
|
|
last_height = driver.execute_script(
|
|
"window.scrollTo(0, document.body.scrollHeight);var scrolldown=document.body.scrollHeight;return scrolldown;")
|
|
"window.scrollTo(0, document.body.scrollHeight);var scrolldown=document.body.scrollHeight;return scrolldown;")
|
|
link = []
|
|
link = []
|
|
while True:
|
|
while True:
|
|
- print(c)
|
|
|
|
- href_element = wait.until(ec.visibility_of_all_elements_located((By.XPATH, "//a[@href]")))
|
|
|
|
- for i in href_element:
|
|
|
|
- link.append(i.get_attribute('href'))
|
|
|
|
- print(link)
|
|
|
|
|
|
+ print(c, c_sroll)
|
|
|
|
+
|
|
|
|
+ # href_element = wait.until(ec.visibility_of_all_elements_located((By.XPATH, "//a[@href]")))
|
|
|
|
+ # for i in href_element:
|
|
|
|
+ # link.append(i.get_attribute('href'))
|
|
|
|
+ # print(link)
|
|
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
|
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
|
time.sleep(3)
|
|
time.sleep(3)
|
|
- new_height = driver.execute_script("return document.body.scrollHeight")
|
|
|
|
- driver.implicitly_wait(3)
|
|
|
|
|
|
+ # new_height = driver.execute_script("return document.body.scrollHeight")
|
|
|
|
+ # driver.implicitly_wait(3)
|
|
c+=1
|
|
c+=1
|
|
|
|
+ c_sroll+=1
|
|
if c > limit_of_posts:
|
|
if c > limit_of_posts:
|
|
print('中場休息')
|
|
print('中場休息')
|
|
- time.sleep(randint(180,300))
|
|
|
|
|
|
+ time.sleep(randint(30,60))
|
|
c=0
|
|
c=0
|
|
- if len(link)>5000:
|
|
|
|
|
|
+ # if len(link)>500:
|
|
|
|
+ # break
|
|
|
|
+ if c_sroll>limit_of_scroll:
|
|
break
|
|
break
|
|
-
|
|
|
|
|
|
+ html = driver.page_source
|
|
|
|
+ soup = BeautifulSoup(html, 'lxml')
|
|
|
|
+ for elem in soup.select('article div div div div a'):
|
|
|
|
+ if elem['href'] not in link:
|
|
|
|
+ link.append(elem['href'])
|
|
driver.close()
|
|
driver.close()
|
|
print(len(link))
|
|
print(len(link))
|
|
return link
|
|
return link
|
|
@@ -80,16 +91,25 @@ def hashtag():
|
|
hashtag = []
|
|
hashtag = []
|
|
c=0
|
|
c=0
|
|
result = download_post()
|
|
result = download_post()
|
|
|
|
+ s = Service(path)
|
|
|
|
+ driver = webdriver.Chrome(service=s)
|
|
|
|
+ driver.implicitly_wait(3)
|
|
|
|
+ driver.get('https://www.instagram.com/')
|
|
|
|
+ time.sleep(5)
|
|
|
|
+ driver.find_element(By.NAME,'username').send_keys(sbaccount)
|
|
|
|
+ driver.find_element(By.NAME,'password').send_keys(sbpd)
|
|
|
|
+ time.sleep(3)
|
|
|
|
+ driver.find_element(By.XPATH,'//*[@id="loginForm"]/div/div[3]/button').click() # 登入
|
|
|
|
+ driver.implicitly_wait(3)
|
|
|
|
+ driver.find_element(By.XPATH,'//*[@id="react-root"]/section/main/div/div/div/section/div/button').click()
|
|
|
|
+ time.sleep(3)
|
|
for url in result:
|
|
for url in result:
|
|
- s = Service(path)
|
|
|
|
- driver = webdriver.Chrome(service=s)
|
|
|
|
- driver.implicitly_wait(3)
|
|
|
|
- driver.get(url)
|
|
|
|
|
|
+ driver.get(f"https://www.instagram.com/{url}")
|
|
soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
soup = BeautifulSoup(driver.page_source, 'html.parser')
|
|
hashtags = soup.find_all('a', class_='xil3i')
|
|
hashtags = soup.find_all('a', class_='xil3i')
|
|
for tag in hashtags:
|
|
for tag in hashtags:
|
|
print(tag.text)
|
|
print(tag.text)
|
|
- hashtag.append(tag)
|
|
|
|
|
|
+ hashtag.append(tag.text)
|
|
c+=1
|
|
c+=1
|
|
if c>10:
|
|
if c>10:
|
|
time.sleep(3)
|
|
time.sleep(3)
|
|
@@ -101,9 +121,11 @@ def hashtag():
|
|
password='jondae350',
|
|
password='jondae350',
|
|
database='ig_tags')
|
|
database='ig_tags')
|
|
cur = db.cursor()
|
|
cur = db.cursor()
|
|
- query = "ALTER TABLE ig_tags ADD taiwan_food TEXT(100)"
|
|
|
|
|
|
+ query = "ALTER TABLE ig_tags ADD health_product TEXT(100)"
|
|
|
|
+ id_number = 1
|
|
for i in hashtag:
|
|
for i in hashtag:
|
|
- cur.execute('INSERT INTO ig_tags (taiwan_food) VALUES (%s)',i)
|
|
|
|
|
|
+ cur.execute(f'UPDATE ig_tags set snack={i} where id={id_number}')
|
|
|
|
+ id_number+=1
|
|
db.commit()
|
|
db.commit()
|
|
db.close()
|
|
db.close()
|
|
return hashtag
|
|
return hashtag
|