# topic_tree_crawler.py - fetch Google Trends interest-over-time data per keyword and store it in MySQL.
import datetime
import time

import dataset
import numpy as np
import pandas as pd
# import the TrendReq method from the pytrends request module
from pytrends.request import TrendReq
  8. def transform_time(array):
  9. ans = []
  10. for i in range(len(array)):
  11. ans.append(datetime.datetime.strptime(str(array[i])[:10], "%Y-%m-%d"))
  12. return np.array(ans)
  13. def df_to_db(df):
  14. columns = ['iot_kword', 'iot_date', 'iot_value', 'iot_dtime']
  15. length = len(df)
  16. iot_kword = df.columns[0]
  17. to_db_df = pd.DataFrame(columns=columns)
  18. iot_kwords = len(df) * [iot_kword]
  19. iot_dates = transform_time(df.index.values)
  20. iot_values = df[iot_kword].values
  21. iot_dtime = datetime.datetime.utcnow() + datetime.timedelta(hours=8)
  22. to_db_df['iot_kword'] = iot_kwords
  23. to_db_df['iot_date'] = iot_dates
  24. to_db_df['iot_value'] = iot_values
  25. to_db_df['iot_dtime'] = iot_dtime
  26. return to_db_df
  27. def get_table(table_name, db_name):
  28. db = dataset.connect(f'mysql://choozmo:pAssw0rd@db.ptt.cx:3306/{db_name}?charset=utf8mb4')
  29. table = db[table_name]
  30. return table
  31. def data_to_db(table, data):
  32. rows = []
  33. columns = ['iot_kword', 'iot_date', 'iot_value', 'iot_dtime']
  34. for i in range(len(data)):
  35. rows.append({})
  36. for j, column in enumerate(data.columns):
  37. rows[i][column] = data.iloc[i, j]
  38. print('db updating...')
  39. table.insert_many(rows)
  40. print('db updated.')
  41. def crawler_iot_topic_tree(keywords, timeframe='today 3-m'):
  42. """
  43. 輸入keywords,
  44. """
  45. pytrend = TrendReq()
  46. for keyword in keywords:
  47. print(f'{keyword} 資料抓取中...')
  48. pytrend.build_payload(
  49. kw_list=[keyword],
  50. cat=0,
  51. timeframe=timeframe,
  52. geo='TW',
  53. gprop=''
  54. )
  55. to_topics_interest_over_time = pytrend.interest_over_time()
  56. data = df_to_db(to_topics_interest_over_time)
  57. table = get_table('topic_tree_g_trend_iot', 'cmm_test')
  58. data_to_db(table, data)
  59. time.sleep(5)