{ "cells": [ { "cell_type": "markdown", "id": "795dcb47", "metadata": {}, "source": [ "# Load data from database" ] }, { "cell_type": "code", "execution_count": 1, "id": "09956185", "metadata": {}, "outputs": [], "source": [ "import time\n", "import pymysql\n", "pymysql.install_as_MySQLdb()\n", "# import MySQLdb\n", "import dataset\n", "import pandas as pd\n", "import pickle\n", "from collections import Counter\n", "import numpy as np\n", "from random import sample\n", "\n", "import os\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "start_time = time.time()" ] }, { "cell_type": "code", "execution_count": 2, "id": "2b0900b3", "metadata": {}, "outputs": [], "source": [ "db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/hhh?charset=utf8mb4')\n", "result = db.query('SELECT * FROM gnews.gnews_detail ')" ] }, { "cell_type": "code", "execution_count": 3, "id": "51436059", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of news: 4286\n" ] } ], "source": [ "data = pd.DataFrame(result, columns=next(iter(result)).keys())\n", "print('Number of news:',len(data))" ] }, { "cell_type": "code", "execution_count": 4, "id": "8c16f7da", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Drop duplicates ...Number of news: 3678\n" ] } ], "source": [ "print('Drop duplicates ...', end='')\n", "data = data.drop_duplicates(subset=['news_content'], keep='first').drop_duplicates(subset=['news_url'], keep='first')\n", "print('Number of news:',len(data))" ] }, { "cell_type": "code", "execution_count": 5, "id": "928c2993", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Paddle enabled successfully......\n", "Building prefix dict from /home/mmt/ccc/ChoozMo/gnews_keyword_extraction/dict.txt.big ...\n", "Loading model from cache /tmp/jieba.uc1cd92f76284b553fb5ffce9f1723528.cache\n", "Loading model cost 0.713 seconds.\n", "Prefix dict has been built successfully.\n" ] } ], "source": [ "import json\n", "import operator\n", "from typing import List, Tuple, Optional\n", "import os\n", "import jieba\n", "import jieba.posseg as pseg\n", "import jieba.analyse\n", "\n", "import paddle\n", "paddle.enable_static()\n", "\n", "jieba.enable_paddle()\n", "\n", "\n", "jieba.set_dictionary('dict.txt.big')\n", "\n", "jieba.load_userdict('jieba_add_word.txt')\n", "jieba.load_userdict('jieba_add_word_kw_with_weighting.txt')\n", "\n", "check_pos_list=[]\n", "\n", "# Check if contains num\n", "def notNumStr(instr):\n", " for item in instr:\n", " if '\\u0041' <= item <= '\\u005a' or ('\\u0061' <= item <='\\u007a') or item.isdigit():\n", " return False\n", " return True\n", "\n", "# Read Target Case if Json\n", "def readSingleTestCases(testFile):\n", " with open(testFile) as json_data:\n", " try:\n", " testData = json.load(json_data)\n", " except:\n", " # This try block deals with incorrect json format that has ' instead of \"\n", " data = json_data.read().replace(\"'\",'\"')\n", " try:\n", " testData = json.loads(data)\n", " # This try block deals with empty transcript file\n", " except:\n", " return \"\"\n", " returnString = \"\"\n", " for item in testData:\n", " try:\n", " returnString += item['text']\n", " except:\n", " returnString += item['statement']\n", " return returnString\n", "\n", "class Word():\n", " def __init__(self, char, freq = 0, deg = 0):\n", " self.freq = freq\n", " self.deg = deg\n", " self.char = char\n", "\n", " 
def returnScore(self):\n", " return self.deg/self.freq\n", "\n", " def updateOccur(self, phraseLength):\n", " self.freq += 1\n", " self.deg += phraseLength\n", "\n", " def getChar(self):\n", " return self.char\n", "\n", " def updateFreq(self):\n", " self.freq += 1\n", "\n", " def getFreq(self):\n", " return self.freq\n", "\n", "class Rake:\n", "\n", " def __init__(self): # , stopwordPath: str = None, delimWordPath: str = None):\n", " # If both Found and Initialized\n", " self.initialized = False\n", " self.stopWordList = list()\n", " self.delimWordList = list()\n", "\n", " def initializeFromPath(self, stopwordPath: str = \"\", delimWordPath: str = \"\"):\n", " if not os.path.exists(stopwordPath):\n", " print(\"Stop Word Path invalid\")\n", " return\n", "\n", " if not os.path.exists(delimWordPath):\n", " print(\"Delim Word Path Invalid\")\n", " return\n", "\n", " \n", " swLibList = [line.rstrip('\\n') for line in converter.convert(open(stopwordPath,'r').read()).split('\\n')]\n", " conjLibList = [line.rstrip('\\n') for line in converter.convert(open(delimWordPath,'r').read()).split('\\n')]\n", " \n", " \n", " self.initializeFromList(swLibList, conjLibList)\n", " return\n", " \n", " def initializeFromList(self, swList : List = None, dwList : List = None):\n", " self.stopWordList = swList\n", " self.delimWordList = dwList\n", " \n", " if len(self.stopWordList) == 0 or len(self.delimWordList) == 0:\n", " print(\"Empty Stop word list or deliminator word list, uninitialized\")\n", " return\n", " else:\n", " self.initialized = True\n", "\n", " def extractKeywordFromPath(self, text : str, num_kw : int = 10):\n", " if not self.initialized:\n", " print(\"Not initialized\")\n", " return \n", "\n", " with open(text,'r') as fp:\n", " text = fp.read()\n", " return self.extractKeywordFromString(text, num_kw = num_kw)\n", " \n", " def extractKeywordFromString(self, text : str, num_kw : int = 10):\n", " rawtextList = pseg.cut(text)\n", " \n", " # Construct List of Phrases and Preliminary textList\n", " textList = []\n", " listofSingleWord = dict()\n", " lastWord = ''\n", "\n", " # for jieba\n", " poSPrty = ['zg','m','x','uj','ul','mq','u','v','f','t','vd','q','r','d','p','nr','r'\n", " 'c','TIME','xc','a','ad','an','nrt','df','b','vn','l','y','o','i']\n", "\n", " meaningfulCount = 0\n", " checklist = []\n", " for eachWord, flag in rawtextList:\n", " \n", " check_pos_list.append(str(eachWord)+'/'+str(flag))\n", " \n", " checklist.append([eachWord,flag])\n", " if eachWord in self.delimWordList or not notNumStr(eachWord) or eachWord in self.stopWordList or flag in poSPrty or eachWord == '\\n':\n", " if lastWord != '|':\n", " textList.append(\"|\")\n", " lastWord = \"|\"\n", " elif eachWord not in self.stopWordList and eachWord != '\\n':\n", " textList.append(eachWord)\n", " meaningfulCount += 1\n", " if eachWord not in listofSingleWord:\n", " listofSingleWord[eachWord] = Word(eachWord)\n", " lastWord = ''\n", "\n", " # Construct List of list that has phrases as wrds\n", " newList = []\n", " tempList = []\n", " for everyWord in textList:\n", " if everyWord != '|':\n", " tempList.append(everyWord)\n", " else:\n", " newList.append(tempList)\n", " tempList = []\n", "\n", " tempStr = ''\n", " for everyWord in textList:\n", " if everyWord != '|':\n", " tempStr += everyWord + '|'\n", " else:\n", " if tempStr[:-1] not in listofSingleWord:\n", " listofSingleWord[tempStr[:-1]] = Word(tempStr[:-1])\n", " tempStr = ''\n", "\n", " # Update the entire List\n", " for everyPhrase in newList:\n", " res = ''\n", " for everyWord 
in everyPhrase:\n", " listofSingleWord[everyWord].updateOccur(len(everyPhrase))\n", " res += everyWord + '|'\n", " phraseKey = res[:-1]\n", " if phraseKey not in listofSingleWord:\n", " listofSingleWord[phraseKey] = Word(phraseKey)\n", " else:\n", " listofSingleWord[phraseKey].updateFreq()\n", "\n", " # Get score for entire Set\n", " outputList = dict()\n", " for everyPhrase in newList:\n", "\n", " if len(everyPhrase) > 5:\n", " continue\n", " score = 0\n", " phraseString = ''\n", " outStr = ''\n", " for everyWord in everyPhrase:\n", " score += listofSingleWord[everyWord].returnScore()\n", " phraseString += everyWord + '|'\n", " outStr += everyWord\n", " phraseKey = phraseString[:-1]\n", " freq = listofSingleWord[phraseKey].getFreq()\n", "\n", " if freq / meaningfulCount < 0.05 and freq < 3 :\n", " continue\n", " outputList[outStr] = score\n", " \n", " sorted_list = sorted(outputList.items(), key = operator.itemgetter(1), reverse = True)\n", " sorted_list = [s[0] for s in sorted_list]\n", " return sorted_list[:num_kw]" ] }, { "cell_type": "code", "execution_count": 6, "id": "cb3b8175", "metadata": { "scrolled": true }, "outputs": [], "source": [ "import random\n", "\n", "from nltk.corpus import stopwords\n", "import time\n", "import opencc\n", "\n", "converter = opencc.OpenCC('s2t.json')\n", "converter.convert('汉字') # 漢字\n", "cc_stopwords = converter.convert(open(\"cn_stopwords.txt\", \"r\").read()).split('\\n')\n", "\n", "topK = 80\n", "\n", "obj = Rake()\n", "stop_path = \"./stoplist/中文停用词表(1208个).txt\"\n", "conj_path = \"./stoplist/中文分隔词词库.txt\"\n", "obj.initializeFromPath(stop_path, conj_path)\n", "\n", "\n", "\n", "# 创建一个停用词列表\n", "with open('customized_stopwords.pickle', 'rb') as handle:\n", " customized_stopwords = pickle.load(handle)\n", "customized_stopwords.extend(stopwords.words('english'))\n", "customized_stopwords.extend(cc_stopwords)" ] }, { "cell_type": "code", "execution_count": 7, "id": "4461d197", "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2021-07-16 18:10:05.526192: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "【Transformer】Documents embedding ... DONE!\n", "embeddings.shape: (3678, 512)\n" ] } ], "source": [ "from sentence_transformers import SentenceTransformer, util\n", "from transformers import AutoTokenizer, AutoModel, BertTokenizerFast\n", "import torch\n", "import umap\n", "from more_itertools import ichunked\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "\n", "\n", "# documents embedding\n", "print('【Transformer】Documents embedding ... 
',end='')\n", "model = SentenceTransformer('distiluse-base-multilingual-cased-v1')\n", "\n", "embeddings = model.encode(data['news_content'].tolist())\n", "print('DONE!')\n", "print('embeddings.shape:',embeddings.shape)" ] }, { "cell_type": "code", "execution_count": 8, "id": "b4f7c1cb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "【HDBSCAN】Clustering ...DONE!\n", " --------------------------------------------------------------------------------\n", "Number of clusterers: 34\n", "Noise ratio: 90.3 % 3321 / 3678\n", "Clusterers:\n", "{-1: 3321,\n", " 0: 7,\n", " 1: 5,\n", " 2: 17,\n", " 3: 5,\n", " 4: 6,\n", " 5: 6,\n", " 6: 6,\n", " 7: 8,\n", " 8: 9,\n", " 9: 7,\n", " 10: 7,\n", " 11: 8,\n", " 12: 6,\n", " 13: 6,\n", " 14: 9,\n", " 15: 5,\n", " 16: 6,\n", " 17: 5,\n", " 18: 5,\n", " 19: 6,\n", " 20: 5,\n", " 21: 5,\n", " 22: 7,\n", " 23: 10,\n", " 24: 9,\n", " 25: 10,\n", " 26: 37,\n", " 27: 10,\n", " 28: 14,\n", " 29: 9,\n", " 30: 17,\n", " 31: 77,\n", " 32: 8}\n", "--------------------------------------------------------------------------------\n", "Predict doc:\n", "... ,更令殘破公屋變得有私樓感覺。「新裝修」日前分享一宗裝修案例,屋主一家居於屯門山景邨一個約500呎單位37年,育有4名女兒,原本一家六口同住,已透過租者置其屋計劃購入單位。由於兩名女兒先後結婚搬走,僅 ...\n", " --------------------------------------------------------------------------------\n", "Predict_doc (target domain) is predicted to be in cluster # -1 (noise)\n", " -> Replace with the largest cluster # 31\n", "--------------------------------------------------------------------------------\n" ] } ], "source": [ "import hdbscan\n", "import matplotlib.pyplot as plt\n", "from collections import Counter\n", "from itertools import compress\n", "\n", "from collections import Counter\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import matplotlib.colors as mcolors\n", "from matplotlib.pyplot import figure\n", "\n", "import pprint\n", "\n", "# HDBSCAN clustering\n", "print('【HDBSCAN】Clustering ...',end='')\n", "hclusterer = hdbscan.HDBSCAN(prediction_data=True).fit(embeddings) #embeddings_list\n", "print('DONE!\\n','-'*80)\n", "print('Number of clusterers:',len(Counter(hclusterer.labels_)))\n", "print('Noise ratio:',round(list(hclusterer.labels_).count(-1) / len(embeddings),3)*100,'% ',list(hclusterer.labels_).count(-1),'/',len(embeddings))\n", "print('Clusterers:')\n", "pprint.pprint(dict(Counter(hclusterer.labels_)))\n", "print('-'*80)\n", "\n", "\n", "# approximate predict cluster to find target domain\n", "predict_doc = open(\"predict_doc.txt\", \"r\").read()\n", "print('Predict doc:')\n", "print('...',predict_doc[50:150], '...\\n','-'*80)\n", "\n", "\n", "test_labels, strengths = hdbscan.approximate_predict(hclusterer, model.encode([predict_doc]))\n", "\n", "\n", "target_domain_cluster = test_labels[0]\n", "print('Predict_doc (target domain) is predicted to be in cluster #',target_domain_cluster,end='')\n", "if target_domain_cluster == -1:\n", " temp = Counter(hclusterer.labels_)\n", " del temp[-1]\n", " target_domain_cluster = max(temp.items(), key=operator.itemgetter(1))[0]\n", " print(' (noise)\\n -> Replace with the largest cluster #',target_domain_cluster)\n", " \n", "labels, values = zip(*sorted(Counter(hclusterer.labels_[hclusterer.labels_!=-1]).items()))\n", "print('-'*80)" ] }, { "cell_type": "code", "execution_count": 9, "id": "e40ae708", "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "num of news in the cluster # 31 : 77\n", "news in the cluster # 31 :\n", "\n", "... 
宜居住,近期便出現不少小3房格局,最小達28坪、最低總價568萬元就能買到2+1房。專家則提醒,極端小宅得犧牲部分居住空間,室內面積、尺寸可先詢問清楚,避免成屋後空間感落差過大,最怕臥室連雙人床都擺不 ...\n", " --------------------------------------------------------------------------------\n", "... 流行風,而台灣房價居高不下,高總價的房子民眾買不起,只好委屈買小宅,因此小坪數的房子漸漸成為市場的主流,專家表示,小宅雖然有銷售票房保證,但是產品本身還是有不少的缺點,民眾在挑屋選屋時仍要張大眼睛,免 ...\n", " --------------------------------------------------------------------------------\n", "... 銷告知,未來恐再也買不到裝潢屋。對此,專家說明,因房產買賣屬於財產交易,政府並不會對交易內容有太過嚴格的規定,也並未聽聞有禁止銷售裝潢屋的消息。\n", "有網友在PTT發文表示,過去曾發現有建商接連推出新案, ...\n", " --------------------------------------------------------------------------------\n", "... ,因此為省下裝潢費,可能選擇找木工進行裝潢。不過專家提醒,裝潢仍建議找室內設計師,因整體規劃不僅比木工更完善,時間拉長來看,也更省錢省力。\n", "翔翰室內裝修設計總監盧淑媛表示,早期的木工師是裝潢師,用的是 ...\n", " --------------------------------------------------------------------------------\n", "... 可能利用室內空間,會將小坪數空間分隔成2至3房,但卻可能導致客廳的部分沒有採光。就有網友在臉書《買房知識家(Q你的A)》發文詢問,「客廳無採光的房子,建議買嗎?」\n", "貼文曝光後,許多網友點出客廳無採光缺 ...\n", " --------------------------------------------------------------------------------\n", "... 家庭型態轉變與房價上漲,現在小坪數建案當道,就有網友在臉書《買房知識家(A你的Q)》發文,「想問一下,有人可以分享一下住室內坪數18-20坪、隔三房的居住感想嗎?」想知道這樣的房型,適不適合2大1小的 ...\n", " --------------------------------------------------------------------------------\n", "... 上漲,導致現今房子坪數越做越小,買方就更要求空間使用最佳化,在這種情況下雙主臥還有吸引力嗎?一名網友指出,他最近看到一間實坪50坪物件,雖然已經算是大坪數,但對於「雙主臥」設計還是頗有疑慮,也擔心市場 ...\n", " --------------------------------------------------------------------------------\n", "... 期新成屋、預售屋推案預估量高達 2435.6 億元,不僅比去年暴增約 700 億元,年增 44%,更創下統計以來新高紀錄,其中,新北市推案量也飆破千億元大關。\n", "\n", "住展研發長何世昌表示,雖然近期美、中對 ...\n", " --------------------------------------------------------------------------------\n", "... ,2房均價是2013年至今、31季以來新高,1房則是31季以來第三高,各行政區交易表現,又以桃園區件數最多、中壢區居次;專家指出,小宅當道與社會人口結構改變有關,目前以小家庭居多,國人坪數需求降低,薪 ...\n", " --------------------------------------------------------------------------------\n", "... 
的規劃常常讓人傷腦筋。一名網友在PTT表示,房價越漲越貴,在這種狀況下很難買到大坪數物件,他最近就看到一個19坪建案,讓他驚呼「根本是給寵物住的」。\n", "貼文曝光後,網友回應,19坪的房子扣掉公設後才10 ...\n", " --------------------------------------------------------------------------------\n" ] } ], "source": [ "from tqdm import tqdm\n", "from random import sample\n", "\n", "\n", "# check news in the cluster\n", "cluster_num = target_domain_cluster\n", "fil = [l==cluster_num for l in hclusterer.labels_]\n", "doc_list = list(compress(data['news_content'].tolist(), fil))\n", "\n", "print('num of news in the cluster #',cluster_num,':', len(doc_list))\n", "print('news in the cluster #',cluster_num,':\\n')\n", "for d in sample(doc_list,min(10,len(doc_list))):\n", " print('...',d[50:150], '...\\n','-'*80)" ] }, { "cell_type": "code", "execution_count": 10, "id": "18b801b7", "metadata": {}, "outputs": [], "source": [ "from string import punctuation\n", "import pke\n", "\n", "def pke_MultipartiteRank(text):\n", "\n", " # initialize a TopicRank extractor\n", " extractor = pke.unsupervised.MultipartiteRank()\n", "\n", " # load the content of the document and perform French stemming\n", "\n", " extractor.load_document(input=text,\n", " language='zh',\n", " normalization=None)\n", "\n", " # keyphrase candidate selection, here sequences of nouns and adjectives\n", " # defined by the Universal PoS tagset\n", " extractor.candidate_selection(pos={\"NOUN\", \"PROPN\"}, stoplist=customized_stopwords)\n", "\n", " # candidate weighting, here using a random walk algorithm\n", " extractor.candidate_weighting(alpha=1.1,\n", " threshold=0.65,\n", " method='average')\n", "\n", " # N-best selection, keyphrases contains the 10 highest scored candidates as\n", " # (keyphrase, score) tuples\n", " keyphrases = extractor.get_n_best(n=40) #topK\n", "\n", " return [j for sub in [k[0].split() for k in keyphrases] for j in sub]\n", "\n", "def half2full(s):\n", " n = []\n", " for c in list(s):\n", " num = ord(c)\n", " if num == 320:\n", " num = 0x3000\n", " elif 0x21 <= num <= 0x7E:\n", " num += 0xfee0\n", " num = chr(num)\n", " n.append(num)\n", " return ''.join(n)\n", "\n", "def find_tags(doc_list):\n", " pos_list=['ns', 'n', 'nt', 'nz', 'x', 'ns','nrfg', 'an'] #, 'vn'\n", " punctuations = punctuation+half2full(punctuation)+'、●{}「」[]【】()()<>《》〈〉『』〔〕'\n", " tag_list=[]\n", " pbar = tqdm(range(len(doc_list)))\n", " pbar.set_description(\"[Extracting keywords...]\")\n", "\n", " fail_count = 0\n", "\n", " for d in doc_list:\n", " pbar.update()\n", " d = d.translate(d.maketrans(punctuations, ' '*len(punctuations), \"\"))\n", " try:\n", " result = []\n", " result.extend(obj.extractKeywordFromString(d, num_kw=topK))\n", " result.extend(jieba.analyse.extract_tags(d, topK=topK, allowPOS=(pos_list)))\n", " result.extend(jieba.analyse.textrank(d, topK=topK, allowPOS=(pos_list)))\n", " result.extend(pke_MultipartiteRank(d))\n", " except:\n", " result = 'a'\n", " fail_count +=1\n", " print('-'*80)\n", " print('Keywords of this news are not available:\\n','...',d[50:150], '...\\n','-'*80)\n", "\n", "\n", " tags = list(filter(lambda x: len(x)>1 and notNumStr(x) and x not in list(set(customized_stopwords)), result))\n", " tag_list.extend(tags)\n", " \n", " \n", " tag_list = list(set(tag_list))\n", " pbar.close()\n", " \n", " print('Num of keywords:', len(tag_list))\n", " print('Fail:', fail_count,'\\n')\n", " \n", " return tag_list" ] }, { "cell_type": "code", "execution_count": 11, "id": "3d384c2d", "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Num of news in the cluster # 
30 #31 #32 : 102\n", "News in the cluster # 30 #31 #32 :\n", "\n", "... 範圍,在市場上非常熱門,也有人說可以買來收租,在租屋市場的價格也不錯。因此你可能會好奇,這樣的小坪數房子是否真的值得購買?\n", "\n", "▲小坪數房子是否真的值得購買?(示意圖/Pixabay)\n", "\n", "回到最根本的源 ...\n", " --------------------------------------------------------------------------------\n", "... 出「小宅」物件,盼以低總價等優勢來吸引民眾目光。網路上也有人感嘆,現在新房的空間越來越小,擔心住起來不舒適。\n", "一名網友就在PTT上發問,老家跟自家都至少3房2廳25坪以上,對四口家庭來說勉強過得去,但 ...\n", " --------------------------------------------------------------------------------\n", "... 上漲,導致現今房子坪數越做越小,買方就更要求空間使用最佳化,在這種情況下雙主臥還有吸引力嗎?一名網友指出,他最近看到一間實坪50坪物件,雖然已經算是大坪數,但對於「雙主臥」設計還是頗有疑慮,也擔心市場 ...\n", " --------------------------------------------------------------------------------\n", "Find tags from renewhouse website ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Extracting keywords...]: 0%| | 0/10 [00:00