| 
					
				 | 
			
			
				@@ -0,0 +1,150 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import pandas as pd  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import numpy as np 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from  datetime import datetime,timedelta 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import tsfresh.feature_extraction.feature_calculators as tsf  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# Find the peaks of a series and count them
			 | 
		
	
		
			
				 | 
				 | 
			
			
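				+# For every time point, take a window of length n on each side; e.g. for 2020-05-01 with n=2, the 2 weeks before and after are inspected to decide whether the point is a peak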
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# The peak criteria are described inside the code
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def find_local_max(x,n,th=0.5): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    max_count = 0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    max_list = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    max_values_list = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for i in range(len(x)): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        before_list = x[:i][-n:] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        num = x[i] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        after_list = x[i+1:][:n] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        other_list = list(before_list) + list(after_list) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        min_other = min(other_list) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        min_other = min_other if min_other!=0 else 1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        max_other = max(other_list) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        max_other = max_other if max_other!=0 else 1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        mean_x = np.mean(x) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        std_x = np.std(x) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if num!=0: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            #波峰邏輯必須同時滿足以下四點 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            #1. (前後區間內最小值/波峰)需大於門檻值th,目前設定th=0.5 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            #2. 前區間所有值都需要小於波峰 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            #3. 後區間所有值都需要小於波峰 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            #4. 波峰需大於群不平均值+兩倍標準差 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if (min_other/float(num)<1-th) and all(before_list<num) and all(after_list<num) and (num>mean_x+2*std_x): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                max_count += 1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                max_list += [i] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                max_values_list += [num]         
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return max_count,max_list,max_values_list 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# Generate features
			 | 
		
	
		
			
				 | 
				 | 
			
			
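				+# Slope of the last two peaks (slope): if there is only one peak, the minimum of the preceding window of length n is located and the slope is computed between that minimum and the peak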
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# Slope between now and the last peak (now_slope): slope between the current time point and the last peak
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# Distance from the peak to now (gap_peak): distance between the last time point and the last peak
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# Ratio of zeros (rate_0): fraction of values in the whole of x that are 0
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# Recent ratio of zeros (rate_0_now): fraction of the last n values of x that are 0
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def gen_feature(x,max_list,n): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    #計算斜率 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def get_slope(value,idx): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if idx[1]-idx[0]==0: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            return None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            return (value[1]-value[0])/(idx[1]-idx[0]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if len(max_list)==0: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return None,None,None,None,None 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    elif len(max_list)==1: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        i = max_list[0] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        before_list = x[:i][-n:] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if len(before_list)!=0: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            min_before_idx = np.argmin(before_list) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            slope_value = [before_list[min_before_idx],x[i]] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            slope_idx = [min_before_idx,i] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            slope = get_slope(slope_value,slope_idx) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            slope = 0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        gap_peak = len(x) - i - 1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        slope_value = [x[max_list[-2]],x[max_list[-1]]] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        slope_idx = [max_list[-2],max_list[-1]] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        slope = get_slope(slope_value,slope_idx) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        gap_peak = len(x) - max_list[-1] - 1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    rate_0 = sum(x==0)/len(x) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    rate_0_now = sum(x[-n:]==0)/len(x[-n:]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if max_list[-1]==len(x)-1: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        now_slope = get_slope([x[-2],x[-1]],[len(x)-1,len(x)]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        now_slope = get_slope([x[max_list[-1]],x[-1]],[max_list[-1],len(x)]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return slope, now_slope, gap_peak, rate_0, rate_0_now 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# Generate the feature data
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# Earliest time point ("min_{}".format(date_nm)): the earliest time point within each key_word_nm
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# Latest time point ("max_{}".format(date_nm)): the latest time point within each key_word_nm
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# Last value ("last_{}".format(value_nm)): the value on the last day
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def gen_feature_df(data,key_word_nm,date_nm,value_nm,n,th): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    feature_df = pd.DataFrame() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for key, analysis_df in data.groupby(key_word_nm): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if len(analysis_df)>n+1: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            analysis_df[date_nm] = [i[:10] for i in analysis_df[date_nm]] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            max_date = max(analysis_df[date_nm]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            min_date = min(analysis_df[date_nm]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            count_analysis_df = len(analysis_df) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            x = analysis_df[value_nm].values 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            max_count,max_list,max_values_list = find_local_max(x,n,th) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            slope, now_slope, gap_peak, rate_0, rate_0_now = gen_feature(x,max_list,n) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            feature_df = feature_df.append({ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                key_word_nm:key, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                "min_{}".format(date_nm):min_date, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                "max_{}".format(date_nm):max_date, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                "count_":count_analysis_df, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                "slope":slope, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                "max_count":max_count, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                "now_slope":now_slope, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                "gap_peak":gap_peak, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                "rate_0":rate_0, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                "rate_0_now":rate_0_now, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                "last_{}".format(value_nm):x[-1]},ignore_index=True) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return feature_df 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# Build groups of keys from their correlation coefficients (corrcoef)
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def gen_corr_set(data,key_word_nm,value_nm,corr_threshold): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    corr_list = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    key_list = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for key,_ in data.groupby(key_word_nm): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        key_list += [key] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        corr_list += [data.loc[data[key_word_nm]==key,value_nm].values] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    x,y = np.where(np.corrcoef(corr_list)>0.7) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    similar_set = gen_similar_set(x,y) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    rule_list = list_to_set(similar_set) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    simulator_nm_list = []         
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for i in rule_list: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        simulator_nm_list += [[key_list[j] for j in i]] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return simulator_nm_list 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def list_to_set(similar_set): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    rule_list = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for rule in similar_set: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        len_rule = len(rule) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        break_list = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        for i in rule_list: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if len(set(i+rule))!=len(i)+len_rule: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                break_list += [True] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                break_list += [False] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if np.sum(break_list)>=1: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            combine_rule = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            re_list = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            for j in np.where(break_list)[0]: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                combine_rule += rule_list[j] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                re_list += [rule_list[j]] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            for re_ in re_list: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                rule_list.remove(re_) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            combine_rule += rule 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            rule_list += [sorted(list(set(combine_rule)))] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            rule_list += [sorted(rule)] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return rule_list 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def gen_similar_set(x,y): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    similar_set = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for i in range(len(x)): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if x[i]!=y[i]: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            similar_set += [[x[i],y[i]]] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return similar_set 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 |