123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122 |
import codecs
import json
import os
import pprint
import re
import sys
import time

import dataset
import instaloader
import pymysql
from instaloader import Instaloader, Hashtag

# Register PyMySQL under the MySQLdb name before any database connection is made.
pymysql.install_as_MySQLdb()
- #db = dataset.connect('sqlite:///:memory:)
- #db = dataset.connect('sqlite:///c:/tmp/ig.db')
- db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4')
- table=db['ig_tags']
- hashtag_regex = re.compile(r"(?:#)(\w(?:(?:\w|(?:\.(?!\.))){0,28}(?:\w))?)")
def proc_tags(stmt):
    """Store every hashtag found in the caption text *stmt*.

    The text is lower-cased and scanned with the module-level
    ``hashtag_regex``.  Each captured tag is printed and then inserted into
    the module-level ``table`` as a row with the keys ``kw`` (the tag),
    ``sid`` (module-level ``ses_id``) and ``query`` (module-level ``query``).
    """
    found_tags = hashtag_regex.findall(stmt.lower())
    for tag in found_tags:
        print(tag)
        row = {'kw': tag, 'sid': ses_id, 'query': query}
        table.insert(row)
def _end_cursor(json_data):
    """Return the ``end_cursor`` of the hashtag media page info in *json_data*."""
    hashtag_node = json_data['graphql']['hashtag']
    return hashtag_node['edge_hashtag_to_media']["page_info"]["end_cursor"]


def _store_top_post_tags(json_data):
    """Pass every caption text of the top posts in *json_data* to ``proc_tags``."""
    top_posts = json_data['graphql']['hashtag']['edge_hashtag_to_top_posts']
    for post in top_posts['edges']:
        for caption in post['node']['edge_media_to_caption']['edges']:
            proc_tags(caption['node']['text'])


def search_hashtag(session, hashtag, *, extra_pages=3, delay=5,
                   dump_path=r'C:\/Users\/s1301\/Downloads\/tags.txt'):
    """Fetch the explore page of *hashtag* and record the tags of its top posts.

    The first JSON response is dumped to *dump_path*, then the captions of
    the top posts are handed to ``proc_tags``.  Up to *extra_pages* further
    requests are made with the ``max_id`` cursor taken from the previous
    response, sleeping *delay* seconds before each one.

    Args:
        session: Object whose ``context.get_json(path=..., params=...)``
            returns the decoded JSON response (an ``Instaloader`` instance
            in this script).
        hashtag: Tag name without the leading ``#``.
        extra_pages: Maximum number of follow-up requests after the first.
        delay: Seconds to sleep before each follow-up request.
        dump_path: File that receives the first response as JSON text.
            The default is a raw string holding the same path value the
            original non-raw literal produced, without relying on the
            invalid ``\\/`` escape sequence.

    Returns:
        None.
    """
    tag_path = "explore/tags/" + hashtag + "/"
    json_data = session.context.get_json(path=tag_path, params={"__a": 1})

    # The context manager closes the dump file even when the write fails.
    with codecs.open(dump_path, 'w', 'utf-8') as dump_file:
        dump_file.write(json.dumps(json_data))

    print(json_data['graphql']['hashtag']['edge_hashtag_to_top_posts'])
    max_id = _end_cursor(json_data)
    print(max_id)
    _store_top_post_tags(json_data)

    # NOTE(review): every page is processed through its
    # 'edge_hashtag_to_top_posts' list, exactly as before; confirm that the
    # top posts really differ between pages, otherwise the same tags are
    # inserted repeatedly.
    for _ in range(extra_pages):
        if not max_id:
            # NOTE(review): presumably an empty cursor means there is no
            # further page; stopping avoids requesting with max_id=None.
            break
        time.sleep(delay)
        json_data = session.context.get_json(
            path=tag_path, params={"__a": 1, "max_id": max_id})
        max_id = _end_cursor(json_data)
        print(max_id)
        _store_top_post_tags(json_data)
# Driver script: scrape the hashtags for one keyword and print a tag histogram.

# Not referenced by the visible code; kept in case another part of the file
# relies on these names.
hasNextPage = True
pageNumber = 1

L = instaloader.Instaloader(user_agent='Mozilla/5.0 (Linux; Android 9; KFONWI Build/PS7326.3183N; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/106.0.5249.170 Safari/537.36 Instagram 236.0.0.20.109 Android (28/9; 213dpi; 800x1216; Amazon; KFONWI; onyx; mt8168; en_US; 371679860)')

# Value written to the 'sid' column by proc_tags and used to select the rows
# for the summary printed at the end.
ses_id = '198'

# Keyword to scrape.  Alternatives that were listed here as commented-out
# assignments: '補品', '滴雞精', '燉雞湯', '營養品', '胺基酸', '營養師',
# '營養補品', '營養補給', '粉光蔘', '調整體質', '天然漢方', '考生必備',
# '維他命', '礦物質', '西洋蔘', '補充體力', '補身', '營養成分', '飲食控制',
# '龜記', '鶴茶樓', '初韻', '醫療廢棄物', '飲料控', '50嵐'.
query = '心焙雞精'

# Exit without scraping when ig_tags already holds rows for this keyword.
# The value is passed as a bound parameter instead of being concatenated
# into the SQL text; the bind name is 'q' because 'query' is the name of
# db.query's own first parameter.
cursor = db.query('select query from ig_tags where query = :q', q=query)
cnt = sum(1 for _ in cursor)
if cnt > 0:
    sys.exit()

# search_hashtag returns None; the assignment is kept so the module still
# defines 'posts'.
posts = search_hashtag(L, query)

# Print every tag stored under this session id with its row count, most
# frequent first.
cursor = db.query(
    'select kw,count(*) as cnt from ig_tags where sid = :sid '
    'group by kw order by count(*) desc',
    sid=ses_id,
)
for row in cursor:
    print(row['kw'])
    print(row['cnt'])
|