Hong 3 роки тому
батько
коміт
e85bf1cda8
2 змінених файлів з 0 додано та 187 видалено
  1. 0 37
      exploding_topi/exploding_topic_daily.py
  2. 0 150
      exploding_topi/util.py

+ 0 - 37
exploding_topi/exploding_topic_daily.py

@@ -1,37 +0,0 @@
import pandas as pd
from util import *


if __name__ == '__main__':  # fixed: original line was missing the colon (SyntaxError)
    # Load the data; only the keyword, date and monthly-value columns are needed.
    data = pd.read_csv('data_daily.csv').loc[:, ['iday_kword', 'iday_date', 'iday_monthly']]

    # n: half-width of the look-around window for peak detection.
    #    Smaller -> more sensitive (more peaks); larger -> less sensitive (fewer peaks).
    # th: threshold on (window minimum / peak value); smaller -> more sensitive.
    n = 30
    th = 0.5
    key_word_nm = 'iday_kword'
    date_nm = 'iday_date'
    value_nm = 'iday_monthly'
    feature_df = gen_feature_df(data, key_word_nm, date_nm, value_nm, n, th)

    ### exploding_topic_list holds the currently-trending ("exploding") keywords.
    # The features in feature_df identify keywords that are hot right now.
    # A keyword qualifies under either of two rules:
    # 1. a peak occurred within the last peak_day_lag=30 days AND the slope from
    #    the last peak to now (now_slope) is above now_slope_threshold; or
    # 2. the most recent point itself is a peak (gap_peak == 0).
    peak_day_lag = 30
    now_slope_threshold = -2
    exploding_topic_list = feature_df.loc[((feature_df.gap_peak < peak_day_lag) & (feature_df.now_slope > now_slope_threshold)) | \
                                           (feature_df.gap_peak == 0), key_word_nm].tolist()

    ### similar_set groups keywords whose series are highly correlated.
    # rate_0 / rate_0_now filter out series that contain too many zeros.
    rate_0_threshold = 0.5
    corr_threshold = 0.7
    analysis_day = ['2021-05-01', '2021-06-07']
    analysis_data = feature_df.loc[(feature_df.rate_0 < rate_0_threshold) & (feature_df.rate_0_now < rate_0_threshold)]
    analysis_data = pd.merge(analysis_data, data, on=[key_word_nm])
    analysis_data = analysis_data.loc[(analysis_data[date_nm] >= analysis_day[0]) & (analysis_data[date_nm] < analysis_day[1])]
    # Each inner list of similar_set is one group of highly correlated keywords.
    similar_set = gen_corr_set(analysis_data, key_word_nm, value_nm, corr_threshold)

+ 0 - 150
exploding_topi/util.py

@@ -1,150 +0,0 @@
-import pandas as pd 
-import numpy as np
-from  datetime import datetime,timedelta
-import tsfresh.feature_extraction.feature_calculators as tsf 
# Find local peaks (and their count) in a series.
# Each point is checked against a window of n points on either side
# (e.g. for 2020-05-01 with n=2, the 2 points before and after are inspected).
# The exact peak rules are documented inside the loop.
def find_local_max(x, n, th=0.5):
    """Return (max_count, max_list, max_values_list) for the peaks of *x*.

    x  : numpy array of values (element-wise ``before_list < num`` comparison
         is used, so a plain Python list will not work).
    n  : half-width of the look-around window.
    th : sensitivity threshold on (window minimum / peak value);
         smaller -> more peaks detected.
    """
    max_count = 0
    max_list = []
    max_values_list = []
    # Whole-series statistics are loop-invariant: hoisted out of the loop
    # (they were recomputed on every iteration).
    mean_x = np.mean(x)
    std_x = np.std(x)
    for i in range(len(x)):
        before_list = x[:i][-n:]
        num = x[i]
        after_list = x[i + 1:][:n]
        other_list = list(before_list) + list(after_list)
        if not other_list:
            # Single-point series: no neighbours, so no peak can be declared
            # (the original crashed on min([]) here).
            continue
        min_other = min(other_list)
        min_other = min_other if min_other != 0 else 1
        if num != 0:
            # A peak must satisfy all four conditions:
            # 1. (window minimum / peak) is below 1 - th, i.e. the peak clearly
            #    stands out from its surroundings;
            # 2. every value before it in the window is smaller than the peak;
            # 3. every value after it in the window is smaller than the peak;
            # 4. the peak exceeds the series mean + 2 * standard deviation.
            if (min_other / float(num) < 1 - th) and all(before_list < num) \
                    and all(after_list < num) and (num > mean_x + 2 * std_x):
                max_count += 1
                max_list += [i]
                max_values_list += [num]
    return max_count, max_list, max_values_list
-
# Feature generation for one keyword's series.
# slope      : slope between the last two peaks; with a single peak, the slope
#              from the minimum of the preceding n-wide window to the peak.
# now_slope  : slope between the last peak and the most recent point.
# gap_peak   : distance (in points) between the last point and the last peak.
# rate_0     : fraction of zeros over the whole series.
# rate_0_now : fraction of zeros over the last n points.
def gen_feature(x, max_list, n):
    """Return (slope, now_slope, gap_peak, rate_0, rate_0_now) for series *x*.

    x        : numpy array of values (element-wise ``x == 0`` is used).
    max_list : indices of the detected peaks (see find_local_max).
    n        : window half-width for the one-peak slope and for rate_0_now.
    Returns five Nones when no peak was found.
    """
    # Slope between two (value, index) points; None when the index run is zero.
    def get_slope(value, idx):
        if idx[1] - idx[0] == 0:
            return None
        else:
            return (value[1] - value[0]) / (idx[1] - idx[0])
    if len(max_list) == 0:
        return None, None, None, None, None
    elif len(max_list) == 1:
        i = max_list[0]
        before_list = x[:i][-n:]
        if len(before_list) != 0:
            min_before_idx = np.argmin(before_list)
            # Bug fix: argmin is an index into the truncated window, which
            # starts at i - len(before_list), not at 0 — translate it back to
            # a series index so the slope denominator is the real distance.
            min_idx = i - len(before_list) + min_before_idx
            slope_value = [before_list[min_before_idx], x[i]]
            slope_idx = [min_idx, i]
            slope = get_slope(slope_value, slope_idx)
        else:
            slope = 0
        gap_peak = len(x) - i - 1
    else:
        slope_value = [x[max_list[-2]], x[max_list[-1]]]
        slope_idx = [max_list[-2], max_list[-1]]
        slope = get_slope(slope_value, slope_idx)
        gap_peak = len(x) - max_list[-1] - 1
    rate_0 = sum(x == 0) / len(x)
    rate_0_now = sum(x[-n:] == 0) / len(x[-n:])
    if max_list[-1] == len(x) - 1:
        # The last point is itself the peak: use the final one-step slope.
        now_slope = get_slope([x[-2], x[-1]], [len(x) - 2, len(x) - 1])
    else:
        # Bug fix: the last point's index is len(x) - 1 (the original passed
        # len(x), overstating the distance by one and flattening now_slope).
        now_slope = get_slope([x[max_list[-1]], x[-1]], [max_list[-1], len(x) - 1])
    return slope, now_slope, gap_peak, rate_0, rate_0_now
-
-
# Build the per-keyword feature table.
# "min_{date_nm}"  : earliest date seen for the keyword.
# "max_{date_nm}"  : latest date seen for the keyword.
# "last_{value_nm}": most recent value of the series.
def gen_feature_df(data, key_word_nm, date_nm, value_nm, n, th):
    """Return a DataFrame with one row of peak/slope/zero-rate features per keyword.

    Keywords with at most n + 1 observations are skipped (the look-around
    window would be degenerate). Dates are assumed to be strings whose first
    10 characters are 'YYYY-MM-DD'.
    """
    rows = []
    for key, analysis_df in data.groupby(key_word_nm):
        if len(analysis_df) > n + 1:
            # Work on a copy so the groupby slice of `data` is never mutated
            # (avoids SettingWithCopyWarning / accidental aliasing).
            analysis_df = analysis_df.copy()
            # Normalise timestamps to their 'YYYY-MM-DD' prefix.
            analysis_df[date_nm] = [i[:10] for i in analysis_df[date_nm]]
            max_date = max(analysis_df[date_nm])
            min_date = min(analysis_df[date_nm])
            count_analysis_df = len(analysis_df)
            x = analysis_df[value_nm].values
            max_count, max_list, max_values_list = find_local_max(x, n, th)
            slope, now_slope, gap_peak, rate_0, rate_0_now = gen_feature(x, max_list, n)
            rows.append({
                key_word_nm: key,
                "min_{}".format(date_nm): min_date,
                "max_{}".format(date_nm): max_date,
                "count_": count_analysis_df,
                "slope": slope,
                "max_count": max_count,
                "now_slope": now_slope,
                "gap_peak": gap_peak,
                "rate_0": rate_0,
                "rate_0_now": rate_0_now,
                "last_{}".format(value_nm): x[-1]})
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0 (and was
    # O(n^2) — it copied the whole frame per row); build the frame in one go.
    return pd.DataFrame(rows)
-
-
# Group keywords whose value series are highly correlated (via np.corrcoef).
def gen_corr_set(data, key_word_nm, value_nm, corr_threshold):
    """Return a list of keyword groups; within each group every pair of series
    correlates above *corr_threshold*.

    Assumes every keyword in *data* has the same number of observations
    (np.corrcoef requires equal-length rows) — the caller enforces this by
    slicing a common date range first.
    """
    corr_list = []
    key_list = []
    for key, _ in data.groupby(key_word_nm):
        key_list += [key]
        corr_list += [data.loc[data[key_word_nm] == key, value_nm].values]
    # Bug fix: the threshold was hard-coded to 0.7 here, silently ignoring the
    # corr_threshold parameter.
    x, y = np.where(np.corrcoef(corr_list) > corr_threshold)
    similar_set = gen_similar_set(x, y)
    rule_list = list_to_set(similar_set)
    simulator_nm_list = []
    for i in rule_list:
        simulator_nm_list += [[key_list[j] for j in i]]
    return simulator_nm_list
-
def list_to_set(similar_set):
    """Fuse the groups in *similar_set* into disjoint index groups.

    Each entry of *similar_set* is a list of indices. Whenever an incoming
    entry shares at least one element with one or more groups built so far,
    all of them are merged into a single sorted, de-duplicated group; an
    entry with no overlap is appended as its own sorted group.
    """
    groups = []
    for rule in similar_set:
        # A group overlaps `rule` iff concatenating them loses elements
        # once de-duplicated.
        overlapping = [g for g in groups
                       if len(set(g + rule)) != len(g) + len(rule)]
        if overlapping:
            merged = set(rule)
            for g in overlapping:
                merged.update(g)
                groups.remove(g)
            groups.append(sorted(merged))
        else:
            groups.append(sorted(rule))
    return groups
-
def gen_similar_set(x, y):
    """Pair up off-diagonal index pairs.

    x and y are parallel equal-length sequences (e.g. the row/column index
    arrays returned by np.where on a correlation matrix); returns [a, b] for
    every position where a != b, dropping the diagonal self-pairs.
    """
    return [[a, b] for a, b in zip(x, y) if a != b]
-