data.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463
  1. from fastapi import APIRouter, Form, Depends, HTTPException, File, UploadFile,Request
  2. from fastapi.responses import FileResponse, StreamingResponse
  3. from fastapi_login.exceptions import InvalidCredentialsException
  4. from fastapi_login import LoginManager
  5. from fastapi.responses import HTMLResponse
  6. from typing import List,Optional
  7. import pymysql
  8. import numpy as np
  9. import math
  10. import re
  11. import pandas as pd
  12. import ast
  13. import csv
  14. import matplotlib.pyplot as plt
  15. from matplotlib.ticker import MaxNLocator
  16. import matplotlib
  17. import io
  18. matplotlib.rc('font', family='STSong')
  19. plt.rcParams['font.sans-serif']=['SimHei']
  20. plt.rcParams['axes.unicode_minus']=False
  21. pd.set_option('display.max_columns', None)
  22. data = APIRouter()
  23. #USER_INFO_FILE = '/var/www/ntcri_api/app/api/user_information_change.csv'
  24. #ATTEND_RECORD_FILE = '/var/www/ntcri_api/app/api/attend_record_syn.csv'
  25. #CLASS_DETAIL_FILE = '/var/www/ntcri_api/app/api/class_detail.csv'
  26. #CLASS_LIST_FILE = '/var/www/ntcri_api/app/api/class_list.csv'
  27. #CLASS_NAME_FILE = '/var/www/ntcri_api/app/api/class_name1.csv'
  28. #SCHOOL_FILE = '/var/www/ntcri_api/app/api/school.csv'
  29. def call_sql(sql_query):
  30. connection = pymysql.connect(
  31. host='db.ptt.cx',
  32. user='choozmo',
  33. password='pAssw0rd',
  34. database='test'
  35. )
  36. cursor = connection.cursor()
  37. cursor.execute(sql_query)
  38. results = cursor.fetchall()
  39. df = pd.DataFrame(results, columns=[i[0] for i in cursor.description])
  40. cursor.close()
  41. connection.close()
  42. return df
  43. def match():
  44. # based on 'registration'
  45. sql_query = """
  46. SELECT
  47. `user_information`.`id` AS `user_information_id`, `birthday`, `gender`, `position`,
  48. `registration`.`id` AS `registration_id`, `registration`.`user_id`, `registration`.`create_time`,
  49. `class_list`.`id` AS `class_list_id`,`class_list`.`start_time`, `class_list`.`end_time`, `fee_method`,
  50. `class_name`.`id` AS `class_name_id`, `class_name`.`name`, `school_id`, `category`,`group_id`,`group_sort`, `is_inner`
  51. FROM `registration`
  52. JOIN `user_information` ON `registration`.`user_id` = `user_information`.`user_id`
  53. JOIN `class_list` ON `registration`.`event_id` = `class_list`.`id`
  54. JOIN `class_name` ON `class_list`.`name_id` = `class_name`.`id`
  55. """
  56. # based on 'attend_record'
  57. '''
  58. sql_query = """
  59. SELECT
  60. `user_information`.`id` AS `user_information_id`,`birthday`, `gender`, `position`,
  61. `attend_record`.`id` AS `attend_record_id`, `class_detail_id`, `attend_record`.`user_id`, `is_attend`,
  62. `class_detail`.`start_time` ,`class_detail`.`end_time`, `sessions`, `hour`,
  63. `class_list`.`id` AS `class_list_id`, `event`, `fee_method`,
  64. `class_name`.`id` AS `class_name_id`, `class_name`.`name`, `school_id`, `category`,`group_id`,`group_sort`, `is_inner`
  65. FROM `attend_record`
  66. JOIN `user_information` ON `attend_record`.`user_id` = `user_information`.`user_id`
  67. JOIN `class_detail` ON `attend_record`.`class_detail_id` = `class_detail`.`id`
  68. JOIN `class_list` ON `class_detail`.`class_list_id` = `class_list`.`id`
  69. JOIN `class_name` ON `class_list`.`name_id` = `class_name`.`id`
  70. """
  71. '''
  72. match_data = call_sql(sql_query)
  73. return match_data
  74. def student_info():
  75. def fix_dict_format(s):
  76. if isinstance(s, str):
  77. return s.encode().decode('unicode_escape')
  78. pairs = re.findall(r"'([^']+)':(\d+)", s)
  79. fixed_dict = "{" + ", ".join([f"'{key}': {value}" for key, value in pairs]) + "}"
  80. return fixed_dict
  81. else:
  82. return s
  83. # 讀檔,把 position 轉成 dictionary 格式
  84. #user_information = pd.read_csv(file)
  85. sql_query = """
  86. SELECT * FROM `user_information`
  87. """
  88. user_information = call_sql(sql_query)
  89. user_information['position'] = user_information['position'].apply(fix_dict_format)
  90. user_information['position'].fillna('{}', inplace=True)
  91. user_information['position'] = user_information['position'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else {})
  92. # only學員
  93. student_information = user_information[user_information['position'].apply(lambda x: x.get('學員', 0)==1)]
  94. #user_information[user_information['gender'].str.strip() != '']['gender']
  95. return student_information
  96. def count_target_number(df, target, order=None):
  97. item_counts = df[target].value_counts(dropna=True)
  98. result_df = item_counts.reset_index()
  99. result_df.columns = [target, 'Count']
  100. nan_count = df[target].isna().sum()
  101. if nan_count > 0:
  102. nan_df = pd.DataFrame({target: [np.nan], 'Count': [nan_count]})
  103. result_df = pd.concat([result_df, nan_df], ignore_index=True)
  104. if order:
  105. result_df[target] = pd.Categorical(result_df[target], categories=order, ordered=True)
  106. result_df = result_df.sort_values(by=target)
  107. item_list = result_df[target].replace('', '無類別名稱').astype(str).tolist()
  108. count_list = result_df['Count'].astype(str).tolist()
  109. search_results = {target: str(item_list), "Count": str(count_list)}
  110. return search_results
  111. def search_specific_items(df, target, search_items):
  112. if not search_items:
  113. return count_target_number(df, target)
  114. all_items = df[target].unique
  115. not_in_list = [name for name in search_items if name not in all_items]
  116. if not_in_list:
  117. return {target: str(not_in_list), "Error": "Not in list"}
  118. item_counts = df[df[target].isin(search_items)][target].value_counts()
  119. item_list = item_counts.index.astype(str).tolist()
  120. count_list = item_counts.values.tolist()
  121. search_results = {"Item": item_list, "Count": count_list}
  122. return search_results
  123. @data.post("/age_bar")
  124. async def age_bar(age_group_list: Optional[List[int]] = [18, 25, 35, 45, 55, 65]):
  125. def create_labels(age_group_list):
  126. labels = [f"{start}-{end-1}" if end != float('inf') else f"{start}+" for start, end in zip(age_group_list[1:-2], age_group_list[2:-1])]
  127. labels = [f'{str(age_group_list[1]-1)}-'] + labels + [f'{str(age_group_list[-2])}+']
  128. return labels
  129. user_information = student_info()
  130. # 生日
  131. birthday = pd.to_datetime(user_information['birthday'], format='%Y-%m-%d')
  132. current_date = pd.to_datetime('today')
  133. user_information['age'] = ((current_date - birthday).dt.days / 365).round(1).astype(int)
  134. bins = [0] + (age_group_list) + [float('inf')]
  135. labels = create_labels(bins)
  136. user_information['age_group'] = pd.cut(user_information['age'], bins=bins, labels=labels, right=False)
  137. age_group_counts = user_information['age_group'].value_counts(sort=False)
  138. age_group_label = labels
  139. count_list = age_group_counts.values.tolist()
  140. search_results = {"Age_group" : str(age_group_label), "Count": str(count_list)}
  141. # 按年齡分群
  142. # bins = [0, 18, 25, 35, 45, 55, 65, float('inf')]
  143. # labels = ['18-', '18-24', '25-34', '35-44', '45-54', '55-64', '65+']
  144. #user_information['age_group'] = pd.cut(user_information['age'], bins=bins, labels=labels, right=False)
  145. return search_results
  146. @data.post("/gender_bar")
  147. def gender_bar(gender_type: Optional[str] = ''):
  148. user_information = student_info()
  149. if gender_type.strip() in ["男", "女"]:
  150. user_information = user_information[user_information['gender'] == gender_type]
  151. gender_counts = (user_information[user_information['gender'].str.strip() != '']['gender']).value_counts()
  152. gender_list = gender_counts.index.astype(str).tolist()
  153. count_list = gender_counts.values.tolist()
  154. search_results = {"Gender" : str(gender_list), "Count": str(count_list)}
  155. return search_results
  156. @data.post("/reg_gender")
  157. async def reg_gender():
  158. target = 'gender'
  159. match_data = match()
  160. return count_target_number(match_data, 'gender')
  161. @data.post("/reg_age")
  162. async def reg_age(age_group_list: Optional[List[int]] = [18, 25, 35, 45, 55, 65]):
  163. def create_labels(age_group_list):
  164. labels = [f"{start}-{end-1}" if end != float('inf') else f"{start}+" for start, end in zip(age_group_list[1:-2], age_group_list[2:-1])]
  165. labels = [f'{str(age_group_list[1]-1)}-'] + labels + [f'{str(age_group_list[-2])}+']
  166. return labels
  167. match_data = match()
  168. # 生日
  169. birthday = pd.to_datetime(match_data['birthday'], format='%Y-%m-%d')
  170. current_date = pd.to_datetime('today')
  171. match_data['age'] = ((current_date - birthday).dt.days / 365).round(1).astype(int)
  172. bins = [0] + (age_group_list) + [float('inf')]
  173. labels = create_labels(bins)
  174. match_data['age_group'] = pd.cut(match_data['age'], bins=bins, labels=labels, right=False)
  175. age_group_counts = match_data['age_group'].value_counts(sort=False)
  176. age_group_label = labels
  177. count_list = age_group_counts.values.tolist()
  178. search_results = {"Age_group" : str(age_group_label), "Count": str(count_list)}
  179. # 按年齡分群
  180. # bins = [0, 18, 25, 35, 45, 55, 65, float('inf')]
  181. # labels = ['18-', '18-24', '25-34', '35-44', '45-54', '55-64', '65+']
  182. #user_information['age_group'] = pd.cut(user_information['age'], bins=bins, labels=labels, right=False)
  183. return search_results
  184. def time_analysis_search(df, target, search_items):
  185. def time_analysis(df, target, name):
  186. df = df[df[target] == name]
  187. df['start_time'] = pd.to_datetime(df['start_time'])
  188. df['month'] = df['start_time'].dt.to_period('M')
  189. monthly_counts = df['month'].value_counts().sort_index()
  190. dates_list = monthly_counts.index.astype(str).tolist()
  191. count_list = monthly_counts.values.tolist()
  192. return (dates_list, count_list)
  193. all_items = df[target].unique()
  194. search_results = []
  195. for name in search_items or all_items:
  196. if name not in all_items:
  197. search_results.append({target: name, "Error": "Not in list"})
  198. else:
  199. dates_list, count_list = time_analysis(df, target, name)
  200. search_results.append({target: name, "Month": str(dates_list), "Count": str(count_list)})
  201. return search_results
  202. @data.post("/category_line")
  203. async def category_line(category_name: Optional[str] = ''):
  204. target = 'category'
  205. search_items = category_name.split()
  206. match_data = match()
  207. search_results = time_analysis_search(match_data, target, search_items)
  208. return search_results
  209. #@data.post("/group_line")
  210. async def group_line(group_id: Optional[str] = ''):
  211. #group_name_id = {'未來工藝學群': 1, '技藝工藝學群': 2, '生活工藝學群': 3, '青年工藝學群': 4, '世代工藝學群': 5,
  212. # '修護工藝學群': 6, '跨域工藝學群': 7, '線上工藝學群': 8, '希望工程學群': 9}
  213. target = 'group_id'
  214. search_items = group_id.split()
  215. try:
  216. search_items = [int(item) for item in search_items]
  217. except ValueError as e:
  218. return {"msg": str(e)}
  219. match_data = match()
  220. search_results = time_analysis_search(match_data, target, search_items)
  221. return search_results
  222. @data.post("/category_bar")
  223. async def category_bar(category_name: Optional[str] = ''):
  224. target = 'category'
  225. search_items = category_name.split()
  226. match_data = match()
  227. search_results = search_specific_items(match_data, target, search_items)
  228. return search_results
  229. @data.post("/group_bar")
  230. async def group_bar(group_id: Optional[str] = ''):
  231. #group_name_id = {'未來工藝學群': 1, '技藝工藝學群': 2, '生活工藝學群': 3, '青年工藝學群': 4, '世代工藝學群': 5,
  232. # '修護工藝學群': 6, '跨域工藝學群': 7, '線上工藝學群': 8, '希望工程學群': 9}
  233. target = 'group_id'
  234. search_items = group_id.split()
  235. try:
  236. search_items = [int(item) for item in search_items]
  237. except ValueError as e:
  238. return {"msg": str(e)}
  239. match_data = match()
  240. search_results = search_specific_items(match_data, target, search_items)
  241. return search_results
  242. @data.post("/first_course_bar")
  243. async def first_course_bar(by_class_name: Optional[int] = 0, by_category: Optional[int] = 0):
  244. if by_class_name == by_category == 1:
  245. return 'Please select only one.'
  246. elif by_class_name == by_category == 0:
  247. return 'Please select one.'
  248. elif by_class_name not in [0, 1] or by_category not in [0, 1]:
  249. return 'Please input 0 or 1.'
  250. match_data = match()
  251. match_data['create_time'] = pd.to_datetime(match_data['create_time'])
  252. earliest_records = match_data.groupby('user_id').agg({'create_time': 'min', 'name': 'first', 'category': 'first'}).reset_index()
  253. target = 'name' if by_class_name else 'category'
  254. first_class_counts = earliest_records[target].value_counts()
  255. course_list = first_class_counts.index.tolist()
  256. count_list = first_class_counts.values.tolist()
  257. return {target: str(course_list), "Count": str(count_list)}
  258. def repeat_time_search(df, target):
  259. def repeat_time_analysis(df, target, name):
  260. df = df[df[target] == name]
  261. counts = df['user_id'].value_counts()
  262. repeat_times = counts.value_counts().sort_index()
  263. repeat_time_list = repeat_times.index.tolist()
  264. count_list = np.array(repeat_times.values).flatten().tolist()
  265. return (repeat_time_list, count_list)
  266. all_items = df[target].unique()
  267. search_results = []
  268. for name in all_items:
  269. repeat_time_list, count_list = repeat_time_analysis(df, target, name)
  270. search_results.append({target: name, 'repeat_times': str(repeat_time_list), 'count': str(count_list)})
  271. return search_results
  272. @data.post("/repeat_participation_bar")
  273. async def repeat_participation_bar(by_class_name: Optional[int] = 0, by_category: Optional[int] = 0):
  274. if by_class_name == by_category == 1:
  275. return 'Please select only one.'
  276. elif by_class_name == by_category == 0:
  277. return 'Please select one.'
  278. elif by_class_name not in [0, 1] or by_category not in [0, 1]:
  279. return 'Please input 0 or 1.'
  280. target = 'name' if by_class_name else 'category'
  281. #search_items = [name.strip("'\"") for name in class_name.split()]
  282. match_data = match()
  283. search_results = repeat_time_search(match_data, target)
  284. return search_results
  285. @data.post("/school_city_pie")
  286. async def school_city_pie():
  287. #school = pd.read_csv(SCHOOL_FILE)
  288. sql_query = """
  289. SELECT * FROM `schools`
  290. """
  291. school = call_sql(sql_query)
  292. def extract_county_city(address):
  293. pattern = r"(.*?[縣市])"
  294. if isinstance(address, str):
  295. address = address.replace("臺", "台")
  296. match = re.search(pattern, address)
  297. if match:
  298. return match.group(1)[-3:]
  299. return None
  300. school['county_city'] = school['address'].apply(extract_county_city)
  301. city_counts = (school[school['county_city'].str.strip() != '']['county_city']).value_counts()
  302. total_count = city_counts.sum()
  303. #city_percentages = city_counts / total_count * 100
  304. threshold_percent = 0.01
  305. low_county = city_counts[city_counts / total_count < threshold_percent]
  306. city_counts['其他'] = low_county.sum()
  307. city_counts = city_counts[city_counts / total_count >= threshold_percent]
  308. result = {'City':str(city_counts.index.tolist()), 'Number of school':str(city_counts.values.tolist())}
  309. return result
  310. @data.post("/class_name_category_pie")
  311. async def class_name_category_pie():
  312. sql_query = """
  313. SELECT * FROM `class_name`
  314. """
  315. class_name = call_sql(sql_query)
  316. results = count_target_number(class_name, 'category')
  317. return results
  318. #@data.post("/class_detail_category_pie")
  319. async def class_detail_category_pie():
  320. sql_query = """
  321. SELECT `class_detail`.`id` AS `class_detail_id`, `class_detail`.`class_list_id`, `class_detail`.`start_time`, `class_list`.`name_id`,
  322. `class_name`.`name`, `class_name`.`school_id`, `class_name`.`category`, `class_name`.`group_id`
  323. FROM `class_detail`
  324. JOIN `class_list` ON `class_detail`.`class_list_id` = `class_list`.`id`
  325. JOIN `class_name` ON `class_list`.`name_id` = `class_name`.`id`
  326. """
  327. match_data = call_sql(sql_query)
  328. results = count_target_number(match_data, 'category')
  329. return results
  330. #@data.post("/registration_list_name")
  331. async def registration_list_name(category: Optional[int] = 0, group_id: Optional[int] = 0):
  332. if category == group_id == 1:
  333. return 'Please select only one.'
  334. elif category == group_id == 0:
  335. return 'Please select one.'
  336. sql_query = """
  337. SELECT `registration`.`event_id`, `class_list`.`name_id`, `class_name`.`name`, `class_name`.`school_id`, `class_name`.`category`, `class_name`.`group_id`
  338. FROM `registration`
  339. JOIN `class_list` ON `registration`.`event_id` = `class_list`.`id`
  340. JOIN `class_name` ON `class_list`.`name_id` = `class_name`.`id`
  341. """
  342. user_event = call_sql(sql_query)
  343. if category:
  344. return count_target_number(user_event, 'category')
  345. if group_id:
  346. return count_target_number(user_event, 'group_id')