|
@@ -0,0 +1,150 @@
|
|
|
+import pandas as pd
|
|
|
+import numpy as np
|
|
|
+from datetime import datetime,timedelta
|
|
|
+import tsfresh.feature_extraction.feature_calculators as tsf
|
|
|
def find_local_max(x, n, th=0.5):
    """Find local peaks in a time series.

    For every position ``i``, the ``n`` points before and the ``n`` points
    after form the neighbour window (e.g. for weekly data with n=2, the two
    weeks either side of the date).  ``x[i]`` counts as a peak when ALL of
    the following hold:

      1. min(window) / x[i] < 1 - th  -- the peak dominates the window
         (with the default th=0.5 the peak must be at least twice the
         window minimum);
      2. every value before it in the window is strictly smaller;
      3. every value after it in the window is strictly smaller;
      4. x[i] > mean(x) + 2 * std(x)  -- the peak stands out globally.

    Parameters
    ----------
    x : 1-D array-like of numbers.
    n : int, half-window length in samples.
    th : float, peak-dominance threshold (default 0.5).

    Returns
    -------
    tuple ``(max_count, max_list, max_values_list)`` -- the number of peaks,
    their indices, and their values.
    """
    x = np.asarray(x)  # accept plain lists as well as numpy arrays
    # Global statistics are loop-invariant; compute them once, not per i.
    mean_x = np.mean(x)
    std_x = np.std(x)
    global_threshold = mean_x + 2 * std_x

    max_count = 0
    max_list = []
    max_values_list = []
    for i in range(len(x)):
        before_list = x[:i][-n:]
        after_list = x[i + 1:][:n]
        other_list = list(before_list) + list(after_list)
        if not other_list:
            # No neighbours to compare against (len(x) <= 1 or n == 0):
            # the original min()/max() would raise here -- skip instead.
            continue
        num = x[i]
        if num == 0:
            continue
        min_other = min(other_list)
        # Avoid a degenerate 0 numerator in the dominance ratio.
        min_other = min_other if min_other != 0 else 1
        if (min_other / float(num) < 1 - th
                and all(before_list < num)
                and all(after_list < num)
                and num > global_threshold):
            max_count += 1
            max_list.append(i)
            max_values_list.append(num)
    return max_count, max_list, max_values_list
|
|
|
+
|
|
|
def gen_feature(x, max_list, n):
    """Derive slope / recency features from the peaks of a series.

    Parameters
    ----------
    x : 1-D numpy array of values.
    max_list : list of peak indices, as returned by ``find_local_max``.
    n : int, window length used for the "recent" statistics.

    Returns
    -------
    tuple ``(slope, now_slope, gap_peak, rate_0, rate_0_now)``:
      slope      -- slope between the last two peaks; with a single peak,
                    slope from the minimum of the n points before it.
      now_slope  -- slope from the last peak to the final point (when the
                    series ends on a peak, the last two points are used).
      gap_peak   -- distance in samples from the last peak to the end.
      rate_0     -- fraction of zeros in the whole series.
      rate_0_now -- fraction of zeros in the last n points.
    All five are None when ``max_list`` is empty.
    """

    def get_slope(value, idx):
        # Rise over run; None when the two points share an x position.
        if idx[1] - idx[0] == 0:
            return None
        return (value[1] - value[0]) / (idx[1] - idx[0])

    if len(max_list) == 0:
        return None, None, None, None, None
    elif len(max_list) == 1:
        i = max_list[0]
        before_list = x[:i][-n:]
        if len(before_list) != 0:
            min_before_idx = np.argmin(before_list)
            # BUGFIX: argmin indexes into the *window*, not into x.  The
            # window starts at i - len(before_list), so translate the local
            # index back to a global one before measuring the slope's run.
            min_global_idx = i - len(before_list) + min_before_idx
            slope = get_slope([before_list[min_before_idx], x[i]],
                              [min_global_idx, i])
        else:
            # Peak at the very start: no preceding window, slope is flat.
            slope = 0
        gap_peak = len(x) - i - 1
    else:
        slope = get_slope([x[max_list[-2]], x[max_list[-1]]],
                          [max_list[-2], max_list[-1]])
        gap_peak = len(x) - max_list[-1] - 1

    rate_0 = sum(x == 0) / len(x)
    rate_0_now = sum(x[-n:] == 0) / len(x[-n:])

    if max_list[-1] == len(x) - 1:
        # The series ends exactly on a peak: fall back to the last two points.
        now_slope = get_slope([x[-2], x[-1]], [len(x) - 1, len(x)])
    else:
        now_slope = get_slope([x[max_list[-1]], x[-1]],
                              [max_list[-1], len(x)])
    return slope, now_slope, gap_peak, rate_0, rate_0_now
|
|
|
+
|
|
|
+
|
|
|
def gen_feature_df(data, key_word_nm, date_nm, value_nm, n, th):
    """Build a per-key feature table from a long-format DataFrame.

    Groups ``data`` by ``key_word_nm`` and, for every group longer than
    ``n + 1`` rows, computes one feature row:
      * ``min_<date_nm>`` / ``max_<date_nm>`` -- earliest / latest date
        (dates are truncated to their first 10 characters, i.e. YYYY-MM-DD);
      * ``count_``      -- number of rows in the group;
      * ``last_<value_nm>`` -- the last observed value;
      * peak features from ``find_local_max`` / ``gen_feature``
        (``max_count``, ``slope``, ``now_slope``, ``gap_peak``,
        ``rate_0``, ``rate_0_now``).

    Returns a DataFrame with one row per qualifying key.
    """
    rows = []  # collect dicts and build the frame once at the end
    for key, analysis_df in data.groupby(key_word_nm):
        if len(analysis_df) > n + 1:
            # Work on a copy so the groupby slice of `data` is not mutated
            # in place (avoids pandas SettingWithCopy warnings/surprises).
            analysis_df = analysis_df.copy()
            analysis_df[date_nm] = [i[:10] for i in analysis_df[date_nm]]
            max_date = max(analysis_df[date_nm])
            min_date = min(analysis_df[date_nm])
            count_analysis_df = len(analysis_df)
            x = analysis_df[value_nm].values
            max_count, max_list, max_values_list = find_local_max(x, n, th)
            slope, now_slope, gap_peak, rate_0, rate_0_now = gen_feature(x, max_list, n)
            rows.append({
                key_word_nm: key,
                "min_{}".format(date_nm): min_date,
                "max_{}".format(date_nm): max_date,
                "count_": count_analysis_df,
                "slope": slope,
                "max_count": max_count,
                "now_slope": now_slope,
                "gap_peak": gap_peak,
                "rate_0": rate_0,
                "rate_0_now": rate_0_now,
                "last_{}".format(value_nm): x[-1]})
    # DataFrame.append was removed in pandas 2.0; building from a list of
    # dicts in one shot is both the supported and the O(n) way.
    return pd.DataFrame(rows)
|
|
|
+
|
|
|
+
|
|
|
def gen_corr_set(data, key_word_nm, value_nm, corr_threshold):
    """Group keys whose value series are highly correlated.

    Builds one value vector per key (all series must have equal length for
    ``np.corrcoef``), thresholds the correlation matrix, and merges the
    resulting index pairs into disjoint groups of key names.

    Parameters
    ----------
    data : long-format DataFrame.
    key_word_nm : column used to split the data into series.
    value_nm : column holding the series values.
    corr_threshold : correlation cut-off for "similar" series.

    Returns
    -------
    list of lists of key names, one inner list per correlated group.
    """
    corr_list = []
    key_list = []
    for key, _ in data.groupby(key_word_nm):
        key_list.append(key)
        corr_list.append(data.loc[data[key_word_nm] == key, value_nm].values)
    # BUGFIX: honour the corr_threshold parameter (was hard-coded to 0.7).
    x, y = np.where(np.corrcoef(corr_list) > corr_threshold)
    similar_set = gen_similar_set(x, y)
    rule_list = list_to_set(similar_set)
    simulator_nm_list = []
    for group in rule_list:
        simulator_nm_list.append([key_list[j] for j in group])
    return simulator_nm_list
|
|
|
+
|
|
|
def list_to_set(similar_set):
    """Merge overlapping groups into disjoint, sorted groups.

    Each incoming ``rule`` is compared against the groups collected so far;
    every group that shares at least one element with it is removed and
    folded, together with the rule itself, into a single sorted,
    de-duplicated group.  Non-overlapping rules are kept as sorted groups
    of their own.
    """
    merged_groups = []
    for rule in similar_set:
        # A group overlaps the rule exactly when concatenating them loses
        # elements under set(): len(set(g + rule)) < len(g) + len(rule).
        overlapping = [
            group for group in merged_groups
            if len(set(group + rule)) != len(group) + len(rule)
        ]
        if overlapping:
            combined = set(rule)
            for group in overlapping:
                combined.update(group)
                merged_groups.remove(group)
            merged_groups.append(sorted(combined))
        else:
            merged_groups.append(sorted(rule))
    return merged_groups
|
|
|
+
|
|
|
def gen_similar_set(x, y):
    """Return the index pairs [x[i], y[i]] whose members differ.

    Given the row/column indices from a thresholded correlation matrix,
    this drops the diagonal hits (x[i] == y[i]) and keeps the genuine
    cross-series pairs.
    """
    return [[a, b] for a, b in zip(x, y) if a != b]
|
|
|
+
|