"""Collect hashtags that co-occur with a seed Instagram hashtag.

The script fetches the explore page of one seed hashtag, extracts every
hashtag found in the captions of the returned top posts, stores one row per
occurrence in the ``ig_tags`` table and finally prints the per-hashtag
counts for the current session id.
"""
import codecs
import json
import os
import pprint
import re
import sys
import time

import dataset
import instaloader
import pymysql
from instaloader import Instaloader, Hashtag

# Let SQLAlchemy's "mysql://" URLs use PyMySQL as the MySQLdb driver.
pymysql.install_as_MySQLdb()

# NOTE(review): the default URL embeds credentials in source control. Set
# IG_TAGS_DB_URL to override it (e.g. 'sqlite:///c:/tmp/ig.db' for a local
# run) and consider removing the default entirely.
DB_URL = os.environ.get(
    'IG_TAGS_DB_URL',
    'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/seo?charset=utf8mb4',
)
db = dataset.connect(DB_URL)
table = db['ig_tags']

# Captures the text after '#': word characters, optionally with single dots
# in between, never ending on a dot.
hashtag_regex = re.compile(r"(?:#)(\w(?:(?:\w|(?:\.(?!\.))){0,28}(?:\w))?)")

# Where the raw JSON of the first page is dumped for inspection.
# Forward slashes avoid the invalid '\/' escape sequences.
TAGS_DUMP_PATH = 'C:/Users/s1301/Downloads/tags.txt'


def proc_tags(stmt):
    """Store every hashtag found in the caption text *stmt*.

    The caption is lower-cased before matching. Each hashtag is printed and
    inserted into ``ig_tags`` together with the module-level ``ses_id`` and
    ``query`` values that are current at call time.
    """
    for tag in hashtag_regex.findall(stmt.lower()):
        print(tag)
        table.insert({'kw': tag, 'sid': ses_id, 'query': query})


def _fetch_tag_page(session, hashtag, max_id=None):
    """Request one page of the hashtag explore endpoint and return its JSON."""
    params = {"__a": 1}
    if max_id is not None:
        params["max_id"] = max_id
    return session.context.get_json(
        path="explore/tags/" + hashtag + "/", params=params)


def _process_tag_page(json_data):
    """Store the hashtags of a page's top posts and return its end cursor."""
    tag_info = json_data['graphql']['hashtag']
    end_cursor = tag_info['edge_hashtag_to_media']["page_info"]["end_cursor"]
    print(end_cursor)
    # NOTE(review): only the "top posts" edge is read, also on follow-up
    # pages; if that edge is identical on every page the same captions are
    # counted repeatedly. Confirm whether 'edge_hashtag_to_media' was meant.
    for post in tag_info['edge_hashtag_to_top_posts']['edges']:
        for caption in post['node']['edge_media_to_caption']['edges']:
            proc_tags(caption['node']['text'])
    return end_cursor


def search_hashtag(session, hashtag, extra_pages=3, delay=5,
                   dump_path=TAGS_DUMP_PATH):
    """Harvest co-occurring hashtags for *hashtag* into the database.

    Args:
        session: object exposing ``context.get_json`` (an ``Instaloader``
            instance in this script).
        hashtag: seed hashtag, without the leading ``#``.
        extra_pages: maximum number of follow-up pages requested after the
            first one.
        delay: seconds to sleep before each follow-up request.
        dump_path: file that receives the raw JSON of the first page.

    Returns:
        None. Results are written through ``proc_tags``.
    """
    first_page = _fetch_tag_page(session, hashtag)
    # Dump after a successful fetch so a failed request neither truncates an
    # earlier dump nor leaves an open handle behind.
    with codecs.open(dump_path, 'w', 'utf-8') as fw:
        fw.write(json.dumps(first_page))

    cursor = _process_tag_page(first_page)
    for _ in range(extra_pages):
        if not cursor:
            # No further cursor: requesting again would only repeat a page.
            break
        time.sleep(delay)
        cursor = _process_tag_page(_fetch_tag_page(session, hashtag, cursor))


L = instaloader.Instaloader(user_agent=(
    'Mozilla/5.0 (Linux; Android 9; KFONWI Build/PS7326.3183N; wv) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 '
    'Chrome/106.0.5249.170 Safari/537.36 '
    'Instagram 236.0.0.20.109 Android (28/9; 213dpi; 800x1216; Amazon; '
    'KFONWI; onyx; mt8168; en_US; 371679860)'
))

# Session id stored in the 'sid' column of every row written by this run.
ses_id = '198'

# Seed keyword for this run. Earlier seed keywords, kept for reference:
#   補品, 滴雞精, 燉雞湯, 營養品, 胺基酸, 營養師, 營養補品, 營養補給, 粉光蔘,
#   調整體質, 天然漢方, 考生必備, 維他命, 礦物質, 西洋蔘, 補充體力, 補身,
#   營養成分, 飲食控制, 龜記, 鶴茶樓, 初韻, 醫療廢棄物, 飲料控, 50嵐
query = '心焙雞精'

# Skip the run when this keyword already has rows in ig_tags. The bind
# parameter is named 'q' because db.query() itself has a parameter 'query'.
existing_rows = db.query(
    'select query from ig_tags where query = :q', q=query)
if any(True for _ in existing_rows):
    sys.exit()

search_hashtag(L, query)

# Report the hashtags collected under this session id, most frequent first.
tag_counts = db.query(
    'select kw,count(*) as cnt from ig_tags where sid=:sid '
    'group by kw order by count(*) desc',
    sid=ses_id)
for row in tag_counts:
    print(row['kw'])
    print(row['cnt'])