{ "cells": [ { "cell_type": "markdown", "id": "795dcb47", "metadata": {}, "source": [ "# Load data from database" ] }, { "cell_type": "code", "execution_count": 2, "id": "09956185", "metadata": {}, "outputs": [], "source": [ "import time\n", "import pymysql\n", "pymysql.install_as_MySQLdb()\n", "# import MySQLdb\n", "import dataset\n", "import pandas as pd\n", "import pickle\n", "from collections import Counter\n", "import numpy as np\n", "from random import sample\n", "\n", "start_time = time.time()" ] }, { "cell_type": "code", "execution_count": 4, "id": "2b0900b3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnews_urlnews_contentnews_daycrawler_date
02https://www.worldjournal.com/wj/story/121362/5...南加地產火熱,但建材漲,人工貴,房價順勢上揚。(記者李雪/攝影)\\n\\n新冠疫情 掀起一場風...None2021年05月23日
13https://www.chinatimes.com/newspapers/20210505...水泥為所有建設、工程主要原料,又是大宗民生物資之一,在水泥價格預期調漲下,市場指出,水泥漲價...2021-05-05 04:10:00+2021年05月23日
24https://www.singtao.ca/4950752/2021-05-19/news...舉報\\n\\n疫情爆發以來,本材價格急升超過三倍。 CBC\\n\\n【星島綜合報道】北美建材價格...2021-05-19 00:00:002021年05月23日
35https://tw.appledaily.com/property/20210430/LE...房地產業做為台灣經濟的火車頭,從土地、建物、建材、家具乃至設計裝潢產業等,市場皆不容小覷。2...2021-04-30 00:00:002021年05月23日
46https://news.sina.com.tw/article/20210522/3864...同意 AGREE\\n\\n如果您繼續閱讀,視同您同意我們隱私條款。This website u...2021-05-22 10:02:11+2021年05月23日
\n", "
" ], "text/plain": [ " id news_url \\\n", "0 2 https://www.worldjournal.com/wj/story/121362/5... \n", "1 3 https://www.chinatimes.com/newspapers/20210505... \n", "2 4 https://www.singtao.ca/4950752/2021-05-19/news... \n", "3 5 https://tw.appledaily.com/property/20210430/LE... \n", "4 6 https://news.sina.com.tw/article/20210522/3864... \n", "\n", " news_content news_day \\\n", "0 南加地產火熱,但建材漲,人工貴,房價順勢上揚。(記者李雪/攝影)\\n\\n新冠疫情 掀起一場風... None \n", "1 水泥為所有建設、工程主要原料,又是大宗民生物資之一,在水泥價格預期調漲下,市場指出,水泥漲價... 2021-05-05 04:10:00+ \n", "2 舉報\\n\\n疫情爆發以來,本材價格急升超過三倍。 CBC\\n\\n【星島綜合報道】北美建材價格... 2021-05-19 00:00:00 \n", "3 房地產業做為台灣經濟的火車頭,從土地、建物、建材、家具乃至設計裝潢產業等,市場皆不容小覷。2... 2021-04-30 00:00:00 \n", "4 同意 AGREE\\n\\n如果您繼續閱讀,視同您同意我們隱私條款。This website u... 2021-05-22 10:02:11+ \n", "\n", " crawler_date \n", "0 2021年05月23日 \n", "1 2021年05月23日 \n", "2 2021年05月23日 \n", "3 2021年05月23日 \n", "4 2021年05月23日 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "db = dataset.connect('mysql://choozmo:pAssw0rd@db.ptt.cx:3306/hhh?charset=utf8mb4')\n", "result = db.query('SELECT * FROM gnews.gnews_detail ')\n", "data = pd.DataFrame(result, columns=next(iter(result)).keys())\n", "data" ] }, { "cell_type": "code", "execution_count": 5, "id": "e6458138", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id 0\n", "news_url 0\n", "news_content 0\n", "news_day 0\n", "crawler_date 0\n", "dtype: int64\n" ] } ], "source": [ "data.isna().sum()" ] }, { "cell_type": "code", "execution_count": 7, "id": "8c16f7da", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnews_urlnews_contentnews_daycrawler_date
02https://www.worldjournal.com/wj/story/121362/5...南加地產火熱,但建材漲,人工貴,房價順勢上揚。(記者李雪/攝影)\\n\\n新冠疫情 掀起一場風...None2021年05月23日
13https://www.chinatimes.com/newspapers/20210505...水泥為所有建設、工程主要原料,又是大宗民生物資之一,在水泥價格預期調漲下,市場指出,水泥漲價...2021-05-05 04:10:00+2021年05月23日
24https://www.singtao.ca/4950752/2021-05-19/news...舉報\\n\\n疫情爆發以來,本材價格急升超過三倍。 CBC\\n\\n【星島綜合報道】北美建材價格...2021-05-19 00:00:002021年05月23日
35https://tw.appledaily.com/property/20210430/LE...房地產業做為台灣經濟的火車頭,從土地、建物、建材、家具乃至設計裝潢產業等,市場皆不容小覷。2...2021-04-30 00:00:002021年05月23日
46https://news.sina.com.tw/article/20210522/3864...同意 AGREE\\n\\n如果您繼續閱讀,視同您同意我們隱私條款。This website u...2021-05-22 10:02:11+2021年05月23日
..................
42814290https://www.businesstoday.com.tw/article/categ...黃之揚先前做過許多工作,卻都難以持久,怎知一碰見海洋垃圾,就成了他終身志業的所在。\\n\\n來...2019-11-08 11:38:20+2021年06月13日
42824291https://house.udn.com/house/story/11134/4221093獎!獎!獎!清景麟「白易居THE ARCH」 獲三項國際設計獎\\n\\n撰文.攝影/張世雅\\n...None2021年06月13日
42834292https://www.ettoday.net/news/20190928/1543997.htm文/時尚家居 空間設計暨圖片提供/東之光室內裝修設計\\n\\n\\n\\n《2019艾特獎 最佳辦...2019-09-28 15:00:002021年06月13日
42844293https://www.epochtimes.com/b5/19/6/13/n1132041...室內裝修糾紛多?專家指點5步驟 避開陷阱不吃虧\\n\\n【大紀元2019年06月13日訊】(大...2019-06-17 19:59:11+2021年06月13日
42854294https://www.epochtimes.com/gb/19/6/13/n1132041...室内装修纠纷多?专家指点5步骤 避开陷阱不吃亏\\n\\n【大纪元2019年06月13日讯】(大...2019-06-17 19:59:11+2021年06月13日
\n", "

3678 rows × 5 columns

\n", "
" ], "text/plain": [ " id news_url \\\n", "0 2 https://www.worldjournal.com/wj/story/121362/5... \n", "1 3 https://www.chinatimes.com/newspapers/20210505... \n", "2 4 https://www.singtao.ca/4950752/2021-05-19/news... \n", "3 5 https://tw.appledaily.com/property/20210430/LE... \n", "4 6 https://news.sina.com.tw/article/20210522/3864... \n", "... ... ... \n", "4281 4290 https://www.businesstoday.com.tw/article/categ... \n", "4282 4291 https://house.udn.com/house/story/11134/4221093 \n", "4283 4292 https://www.ettoday.net/news/20190928/1543997.htm \n", "4284 4293 https://www.epochtimes.com/b5/19/6/13/n1132041... \n", "4285 4294 https://www.epochtimes.com/gb/19/6/13/n1132041... \n", "\n", " news_content news_day \\\n", "0 南加地產火熱,但建材漲,人工貴,房價順勢上揚。(記者李雪/攝影)\\n\\n新冠疫情 掀起一場風... None \n", "1 水泥為所有建設、工程主要原料,又是大宗民生物資之一,在水泥價格預期調漲下,市場指出,水泥漲價... 2021-05-05 04:10:00+ \n", "2 舉報\\n\\n疫情爆發以來,本材價格急升超過三倍。 CBC\\n\\n【星島綜合報道】北美建材價格... 2021-05-19 00:00:00 \n", "3 房地產業做為台灣經濟的火車頭,從土地、建物、建材、家具乃至設計裝潢產業等,市場皆不容小覷。2... 2021-04-30 00:00:00 \n", "4 同意 AGREE\\n\\n如果您繼續閱讀,視同您同意我們隱私條款。This website u... 2021-05-22 10:02:11+ \n", "... ... ... \n", "4281 黃之揚先前做過許多工作,卻都難以持久,怎知一碰見海洋垃圾,就成了他終身志業的所在。\\n\\n來... 2019-11-08 11:38:20+ \n", "4282 獎!獎!獎!清景麟「白易居THE ARCH」 獲三項國際設計獎\\n\\n撰文.攝影/張世雅\\n... None \n", "4283 文/時尚家居 空間設計暨圖片提供/東之光室內裝修設計\\n\\n\\n\\n《2019艾特獎 最佳辦... 2019-09-28 15:00:00 \n", "4284 室內裝修糾紛多?專家指點5步驟 避開陷阱不吃虧\\n\\n【大紀元2019年06月13日訊】(大... 2019-06-17 19:59:11+ \n", "4285 室内装修纠纷多?专家指点5步骤 避开陷阱不吃亏\\n\\n【大纪元2019年06月13日讯】(大... 2019-06-17 19:59:11+ \n", "\n", " crawler_date \n", "0 2021年05月23日 \n", "1 2021年05月23日 \n", "2 2021年05月23日 \n", "3 2021年05月23日 \n", "4 2021年05月23日 \n", "... ... \n", "4281 2021年06月13日 \n", "4282 2021年06月13日 \n", "4283 2021年06月13日 \n", "4284 2021年06月13日 \n", "4285 2021年06月13日 \n", "\n", "[3678 rows x 5 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = data.drop_duplicates(subset=['news_content'], keep='first').drop_duplicates(subset=['news_url'], keep='first')\n", "data" ] }, { "cell_type": "code", "execution_count": 14, "id": "928c2993", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Building prefix dict from the default dictionary ...\n", "Loading model from cache /tmp/jieba.cache\n", "Loading model cost 0.446 seconds.\n", "Prefix dict has been built successfully.\n" ] } ], "source": [ "import json\n", "import operator\n", "from typing import List, Tuple, Optional\n", "import os\n", "import jieba\n", "import jieba.posseg as pseg\n", "\n", "jieba.load_userdict('jieba_add_word.txt')\n", "jieba.load_userdict('jieba_add_word_kw_with_weighting.txt')\n", "\n", "\n", "check_pos_list=[]\n", "\n", "# Check if contains num\n", "def notNumStr(instr):\n", " for item in instr:\n", " if '\\u0041' <= item <= '\\u005a' or ('\\u0061' <= item <='\\u007a') or item.isdigit():\n", " return False\n", " return True\n", "\n", "# Read Target Case if Json\n", "def readSingleTestCases(testFile):\n", " with open(testFile) as json_data:\n", " try:\n", " testData = json.load(json_data)\n", " except:\n", " # This try block deals with incorrect json format that has ' instead of \"\n", " data = json_data.read().replace(\"'\",'\"')\n", " try:\n", " testData = json.loads(data)\n", " # This try block deals with empty transcript file\n", " except:\n", " return \"\"\n", " returnString = \"\"\n", " for item in testData:\n", " try:\n", " returnString += item['text']\n", " except:\n", " returnString += item['statement']\n", " return returnString\n", "\n", "class Word():\n", " def __init__(self, char, freq = 0, deg = 0):\n", " self.freq = freq\n", " self.deg = deg\n", " self.char = char\n", "\n", " def returnScore(self):\n", " return self.deg/self.freq\n", "\n", " def updateOccur(self, phraseLength):\n", " self.freq += 1\n", " self.deg += phraseLength\n", "\n", " def getChar(self):\n", " return self.char\n", "\n", " def updateFreq(self):\n", " self.freq += 1\n", "\n", " def getFreq(self):\n", " return self.freq\n", "\n", "class Rake:\n", "\n", " def __init__(self): # , stopwordPath: str = None, delimWordPath: str = None):\n", " # If both Found and Initialized\n", " self.initialized = False\n", " self.stopWordList = list()\n", " self.delimWordList = list()\n", "\n", " def initializeFromPath(self, stopwordPath: str = \"\", delimWordPath: str = \"\"):\n", " if not os.path.exists(stopwordPath):\n", " print(\"Stop Word Path invalid\")\n", " return\n", "\n", " if not os.path.exists(delimWordPath):\n", " print(\"Delim Word Path Invalid\")\n", " return\n", "\n", " \n", " swLibList = [line.rstrip('\\n') for line in converter.convert(open(stopwordPath,'r').read()).split('\\n')]\n", " conjLibList = [line.rstrip('\\n') for line in converter.convert(open(delimWordPath,'r').read()).split('\\n')]\n", " \n", " \n", " self.initializeFromList(swLibList, conjLibList)\n", " return\n", " \n", " def initializeFromList(self, swList : List = None, dwList : List = None):\n", " self.stopWordList = swList\n", " self.delimWordList = dwList\n", " \n", " if len(self.stopWordList) == 0 or len(self.delimWordList) == 0:\n", " print(\"Empty Stop word list or deliminator word list, uninitialized\")\n", " return\n", " else:\n", " self.initialized = True\n", "\n", " def extractKeywordFromPath(self, text : str, num_kw : int = 10):\n", " if not self.initialized:\n", " print(\"Not initialized\")\n", " return \n", "\n", " with open(text,'r') as fp:\n", " text = fp.read()\n", " return self.extractKeywordFromString(text, num_kw = num_kw)\n", " \n", " def extractKeywordFromString(self, text : str, num_kw : int = 10):\n", " rawtextList = pseg.cut(text)\n", "\n", " \n", " # Construct List of Phrases and Preliminary textList\n", " textList = []\n", " listofSingleWord = dict()\n", " lastWord = ''\n", "\n", " # for jieba\n", " poSPrty = ['zg','m','x','uj','ul','mq','u','v','f','t','vd','q','r','d','p','nr','r'\n", " 'c','TIME','xc','a','ad','an','nrt','df','b','vn','l','y','o','i']\n", "\n", " meaningfulCount = 0\n", " checklist = []\n", " for eachWord, flag in rawtextList:\n", " \n", " \n", " check_pos_list.append(str(eachWord)+'/'+str(flag))\n", "\n", " checklist.append([eachWord,flag])\n", " if eachWord in self.delimWordList or not notNumStr(eachWord) or eachWord in self.stopWordList or flag in poSPrty or eachWord == '\\n':\n", " if lastWord != '|':\n", " textList.append(\"|\")\n", " lastWord = \"|\"\n", " elif eachWord not in self.stopWordList and eachWord != '\\n':\n", " textList.append(eachWord)\n", " meaningfulCount += 1\n", " if eachWord not in listofSingleWord:\n", " listofSingleWord[eachWord] = Word(eachWord)\n", " lastWord = ''\n", "\n", " # Construct List of list that has phrases as wrds\n", " newList = []\n", " tempList = []\n", " for everyWord in textList:\n", " if everyWord != '|':\n", " tempList.append(everyWord)\n", " else:\n", " newList.append(tempList)\n", " tempList = []\n", "\n", " tempStr = ''\n", " for everyWord in textList:\n", " if everyWord != '|':\n", " tempStr += everyWord + '|'\n", " else:\n", " if tempStr[:-1] not in listofSingleWord:\n", " listofSingleWord[tempStr[:-1]] = Word(tempStr[:-1])\n", " tempStr = ''\n", "\n", " # Update the entire List\n", " for everyPhrase in newList:\n", " res = ''\n", " for everyWord in everyPhrase:\n", " listofSingleWord[everyWord].updateOccur(len(everyPhrase))\n", " res += everyWord + '|'\n", " phraseKey = res[:-1]\n", " if phraseKey not in listofSingleWord:\n", " listofSingleWord[phraseKey] = Word(phraseKey)\n", " else:\n", " listofSingleWord[phraseKey].updateFreq()\n", "\n", " # Get score for entire Set\n", " outputList = dict()\n", " for everyPhrase in newList:\n", "\n", " if len(everyPhrase) > 5:\n", " continue\n", " score = 0\n", " phraseString = ''\n", " outStr = ''\n", " for everyWord in everyPhrase:\n", " score += listofSingleWord[everyWord].returnScore()\n", " phraseString += everyWord + '|'\n", "\n", " \n", " outStr += everyWord\n", " phraseKey = phraseString[:-1]\n", " freq = listofSingleWord[phraseKey].getFreq()\n", "\n", " \n", " if freq / meaningfulCount < 0.05 and freq < 3 :\n", " continue\n", " outputList[outStr] = score\n", " \n", " \n", " return list(outputList.keys())" ] }, { "cell_type": "code", "execution_count": 15, "id": "cb3b8175", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['', '從而', '同時', '論', '俺們', '一', '但因為', '只要', '而且', '彰攝', '兩者', '育攝', '是', '正是', '塵頭', '如若', '基於', '地將', '遵循', '資料', '啷噹', '呵', '方所', '尺寸', 'en', '但凡', '毋寧', '之', '重置', 'few', '一則', '發現', '2', '只為', '待', '來', '乃至', 'mustn', '礦山', '價落', 'on', '來着', '打造', '按照', 'jpg', '即令', '腳酸', '樣樣', '此外', '地產行業極具', '房大', '可能', '周知', '圖片', '大面', '1', '固然', 'use', '隨着', '房明明', '再者說', '他人', '最具', '怎麼樣', 'only', '致力', '感會', '首先', 'make', '一切', '至若', '較之', '異境', '妹妹', '有時', '如上', '但以', '上下', '當然', 'should', '云爾', '戰果', '靈魂', '任憑', 'me', '產品', '假若', '編號', '空間', '在於', '息息', '影響', '屋況整理乾', '寧願', '無所畏懼', '光是', '類科', '小心', '這就是說', '筆額', '門易關', '並非', '要務', '到', '此時', '以來', '替', '啦', '能', 'video', '{', 'as', '這麼些', '猶且', '依', '要不是', 'other', 'shan', '文中', '身上', '事情', 'but', '乘', '「', '事宜', '撰文編輯', '計師', 'Courtesy', '簡言之', '別以', '地上', '一不注意', 'will', '類如', '便於', '非特', '正如', 'one', 'at', '心裁', '如何', '有些', 'were', '太苦', '如下', '人', '及', '她', '玩意', '性化', '直到', '所在', '那麼', '可見', 'is', '唯有', '今', 'he', '但其', '坪房子實際', '鑑於', '聞雲', '男子', '前者', '具優良風評', '越是', '對', '誠如', '私性', '等', 'years', '公室', '記者林和謙', '先量', '子境', '萬一', '作者', '於小宅', '舉例', '什麼樣', '繼而', '誰', '何時', '嗡嗡', '莫若', 'hadn', '不過', '意願', '運氣', '我們', 'ma', '很', '大量', '由於', '其', '身試法', 'with', '衣空間', '房會', '歸齊', '刀口', 'isn', '內所', '文章曝光後', '用心', '這一來', '或則', '建議', '中正', \"haven't\", '產品時', '直指', '地', '與此同時', '師溝', '豈但', '若是', '騙人', '借', 'do', '們', '諸', '紀錄', '產資訊平台', '這麼樣', '而外', '彼此', 'she', '”', '嚇', '事項', '據', \"wouldn't\", '仔細比', '甚至', '成房貸', '一樣', 'herself', '繼後', '住房地租', 'weren', '記錄', '魔法', '但', '惟其', '難題', '有形', '其一', 'which', '以免', '!', '截至', '台灣', '目前', '無人', '連同', '巴巴', '一般', '除了', 'when', '前此', '經過', '$', '業人員', 'that', 'didn', '總而言之', '這樣', '人居', '但怕視', '道理', '此次', '金代', '不合理', '在下', '嘍', '用法', '矣乎', '的確', '這邊', 'for', '誰人', '設使', 'same', '兒', '那會兒', '能力', 'no', '細心', '建案總', '部份', '諸如', '若夫', '向着', '設若', '盡', '事物', 'are', '別', '介於', '諸位', 'way', 'did', '遵照', \"doesn't\", 'nor', '常會', '主力', '另悉', '之所', '臨', '哉', '獎入', '以', '儘管', '衣機', '設或', \"shan't\", '又及', '拿', '甚至於', \"hadn't\", '能容', '怎麼', '爲止', '哪天', '誤會', '意義', 'we', '差距', '非但', '嗚', '看', '公司名稱', \"shouldn't\", '離', \"weren't\", 'until', '風水上', '倘使', \"she's\", '邊', '根據', 'shouldn', '致', '能溝', '各種', '他們', '每', '會比', '數範圍', '哪邊', '帝國', 'was', '嘻', '處在', '咳', '不如', '自打', '然後', '並說', '花錢', '此間', '譬如', '常態', '自個兒', ',', '規業者', '即如', '只怕', '比如', 'below', '一旦', '怎', '喂', '故此', '也好', '記者', '不惟', '這時', '熱議', '烏乎', 'through', '從此', '中路', '之所以', '由此可見', '最', '般的', '於新穎', '師盧', '接着', '矣哉', \"won't\", '邊機', '甚而', '爲了', '式廚房', '本身', '省地', '第一', '開始', '只限', '國中美術課', 'yours', '傻傻的', '不外乎', '寧可', '戚戚焉', '_', 'Trophy', '哈哈', '各自', '記者許凱彰攝', '先裁', '則', '間界線', 're', 'own', '誰料', 'by', '奇葩', '而為', '本站', '質與量', '無寧', '它們', '嘿', '其它', '難道說', '本心', '或過', '保值性因機能', '結果', '果真', '僵持不下', '能點', \"isn't\", 'who', '\\n', '於是乎', '(。', '天生', '母法', 'here', '由是', '幽靈', 'there', '總的來看', '不妨', '後', '極了', '而言', '呼哧', '然而', '獲得', '反過來說', '時會', '歟', '使', '才', '手筆', '不盡', '各位', '報導', '版主', '這些', '風傳媒節', '按', '所', '內', '六大', '我', '喲', '企業代', '加以', '奇奇怪怪', '不肖', '台北報導', 'than', '使得', '》', 'themselves', '又', 'of', 'yourselves', '重重的', '打', '衝', '爲此', 'projects', '盡有', '所幸', '本', '屋會', '別處', '老實', '身心俱疲', '因坪數小', 'image', 'down', '云云', '距', '再者', '、', 'just', 'i', '哦', '一些', '讓', 'new', '不只', '無法', '因素', 'or', '要是', '叫', '數算', '時候', '啐', '絕佳', '因為房價', '樓半', \"mightn't\", '自後', '意念', '回家', '只', '各', '屋裡', '本人', '並以', '事交', '2019', '方式', '將房門關', '多麼', '不容', '不光', '客買盤', '以故', 'name', '跟', '非徒', '呀', '沿', '名稱', '東販', '賊死', '除開', '則甚', 'ours', '什麼', '誠然', '人群', '旺好', '單者', '要不', 'under', '巴', '人點', '正派', '咚', '譁', '心中', '他', \"hasn't\", '靠山', 'won', '對於', '唉', '因為床', '不特', 'into', '大', '。', '據此', '人性所需', '再其次', 't', '寬境', '就是', '之類', 'category', '比及', '嗡', '嘎', '等等', '哪些', '呢', '既往', '哼', '因了', '望', 'and', '內政部部務會報', '至今', \"you'd\", '雖', '且說', '這個', '人家', '與', '區內湖路', '獵心', '哎呀', 'ourselves', '字頭', \"couldn't\", '億元', '些', '筆錢', '此', '不然', '一個', '將客', '即便', '小包', '雖然', ':', '人員', '內心', '房前', '管', '起亞', '不怕', 'hers', '以為', 'any', '二是', '沿着', '上會', '何處', '所有', '於是', '形式交屋', '區域分', '順着', '即或', '如', '難事', '縱使', '不論是', '今年', '作品參賽', '靜靜的', '屋魔鬼系解', '哥哥', '先求', '6', '區處', '哎喲', '其次', 'after', '高大', '其中', '關於', 'haven', '如同', '出來', '一格', '買幾', '或', '林喬慧', '省得', '幸福', '同', '甜頭', '哪兒', '悠然', '記者嚴鈺雯', '再說', '並', '定義', '矣', '記者張菱育攝', 'my', 'off', 'all', '或是', '就是說', '雲司', '來定義', '記者蘇瑜', '嗬', '分別', '高度喔', '者', '以期', '那樣', '爾後', 'whom', '俺', '此處', 'during', '原本', '不盡然', '不成', '公分', '且', '新高', '果然', '這', '甚或', '針對', 'your', 'their', 'them', '開外', '目光', '精彩', '相對而言', '標題', '當', '那麼樣', '除此之外', '記者林裕豐', 'not', 'itself', 'each', '煥然', 'this', '記者許凱彰', '別說', '數介', '人字', '的話', '記者戴鈺純', '精力', '麼', '賴以', '股份', '不得', '己', '\\n\\n', '界權威', '記者蘇', '%', '隨', '嗯', '因此', '房族', \"aren't\", '因人', '師建議', '由此', '一物', 'himself', '表示', '被', '既', '而後', '當地', '趕', '哪年', '自', '大門尿尿', '那', '不是', '例如', '本地', '那邊', '大台', '沖馬', '從', '有及', '爲', 'some', '房則', '怪物', '兼之', '不拘', 'having', '近百', '得', '客才', '只不過', '此地', 'its', '庶乎', '問題', '你', '即若', '屋人房貸', '對方', '該', 'theirs', '咱', '何況', 'while', 'has', '是的', '但是', '在', \"should've\", '成日', '指出', '照着', \"didn't\", '儻然', '大大的', 'you', '替代', '隨後', '總的說來', '器應裝', '字頭熱門宅', '可區', '儘管如此', '四大', '然則', '身邊朋友', '.', '但要', '着', '坪格局', '仍', 'couldn', '爾爾', '人熱心', '易度', '其他人', '還', '公司', '數產品', '層樓華', '因', 'our', '專門', '孰料', '性差', '樣子', '依舍', '若果 ', '或者', '人會', '網吐槽', '本着', '?', '提供', 's', '精神', 'against', '雖以', '一來', '比方', 'if', '庶幾', '費超', '维境', '性比', '哎', '費越', 'very', '與其說', '何以', '各個', '咧', '騰', '也', '哪怕', 'd', '怎麼辦', '可是', '那個', '時格局', '數動輒', '慘狀', '濕機', '起', 'the', 'what', '3', '阿', '手邊', '情況', '前後', '已矣', 'out', '靠', '對待', '坪上', '人士', '以便', '都', '漫說', '或其', '感覺', '正巧', '它', 'his', '哪', '因着', '身分', '受到', '前務', '縱令', 'to', '倘然', '綜上所述', '著文具', '反觀', '力度', 'aren', \"mustn't\", '屬量', '好', \"you'll\", '不比', '另外', 'where', '也罷', 'wouldn', '意趣', '有', '只當', '拉大', '土生土長', 'does', '全部', '以及', '和平', '記者呂詠', '頻軍', '說來', '叮咚', '只消', '吱', 'because', '格格不入', '飛彈', '某', '內文', '例子', '能爭', '作品', '力道', '以銀行匯款', '通過', '正值', '但若買家', '莫不然', '《', 'ain', '假使', 'how', '容易', '2020', '上會感覺', '以上', '緊接着', '許多', '而是', '由', '照度', ' ', '發展', '既是', '的', '小小的', 'doing', '所國', '哪個', '朝着', '或以', '具體地說', '二來', 'in', '可及', '並且', '爲什麼', '項缺點', '鄙人', '如是', '舊率', '房改', '乎', '慢說', '既有', 'more', 'an', '不論', '焉', '識度', '了', '全世界', '數約', '有所', '至於', '因為', 'so', '需求', '本質', '誰知', '凡是', '太大', '建議民眾', '去年', 'those', '呸', '賞心悅目', 'time', '}', 'doesn', '腦筋', '別的', '工班溝', '而已', '呃', '如其', 'him', '逐步', '竟而', '式更衣室', '甚且', '屋民眾', '彼時', '縱', 'says', '即使', '不獨', '亦', '上百', '4', '加之', '歸', '因而', '可說', '那般', '隨時', '乾脆', '過類', '寧', '越小越', '越大越', '筆買房', '且不說', '-', \"needn't\", '哇', '這次', '這會兒', 'be', '可', '海陸', '無論', 'further', '啊', '幾', '但若', '如上所述', '着呢', 'o', '」', '東西', '每當', '體上', '那兒', '爲着', 'm', '居者', '嘿嘿', '記者黃可昀', '與否', '科技大學財務金融系副教授', '換句話說', '形家具', ')', '商願意', '連', 'about', '反而', 'also', 'once', '中文版', '部分', '因應', '但僅', '內能', '哪裏', '不若', '咦', '換言之', '若以', '打從', '吧噠', '以此', '喏', '會身', '天下', 've', '若非', '若', 'before', '凡', '怎奈', '似的', '飾性', '曾耳聞', '當着', '意境', '心思', '人礙', '尚且', '至', '而', '嘛', '寧肯', '猶自', '給', '這位網友文中', 'over', '經由', \"you're\", '有質', '大腿', 'hasn', 'space', '將光', '數小', '不但', '透過', '情景', '別人', '假如', '和', '經', '人建議', 'now', '某某', '以至於', '自身', '傳媒', '雞毛', '成果', 'between', 'most', '自家', '這麼', '縱然', 'next', '那麼些', 'a', '照', '太多', '區分', '非獨', '自各兒', '嗚呼', '那些', '其二', 'myself', '再', '買房時', '動人', '卻', 'from', '是以', '江怡慧', '旁人', '需拉明線', '心目', '?', '手中', '師能依消費者習慣', 'am', '屋人', '所以', '除', \"you've\", '將', '先行', '哩', '不翼而飛', '要略', 'up', '不至於', '價是', '以至', '爲何', '人化', '不問', '往', '爾', '繼之', '比創', '依照', '罷了', '第', '只是', '雖則', ')。', '家行', '仍舊', '個別', '還要', '再有', '冒', '這般', '泥代', '出於', '這裏', '厚積', '辦法', '幹嘛', '世人', '套衛浴', '端的', '比', '一方面', '笨蛋', '不卡', '趁', '喔唷', '能否', \"that'll\", '三房實質', '泡泡大件衣物', '等到', 'too', '自從', '筆者', '憑藉', '個', '報報', 'it', '先不先', '貴公', '(', '否則', 'well', '烤雞', '別是', '產生', '成川建商新案時', '進而', '對比', '那裏', '啪達', '人能', '若為', '法親', '並同時', '你們', '還是', '林明', '主要', '文曝光', '來說', '反過來', '另', '有的', '才能', '始而', '於坪數', '已', 'then', '式空間', '理事', '萬元', '故而', '那時', '因爲', 'both', '情勢', '後者', '不管', '順', '餘外', '依據', '咱們', '但小坪', '方為', 'er', '投機客會', 'wasn', '多', '以爲', '雙方', '0', 'inline', '像', '奔騰瀟', \"it's\", '全體', '東南亞', '向使', '要', '作爲', '之一', '數字', 'don', '總的來說', '不為人知', '沒奈何', '中散', '先裝潢', '倘或', '因為人', '或曰', '網友坦言', '加泥共', '會落', '故', '塞進', '吧', '套共用', '一空', '可以', '原因', '幾時', '10', '心態', '其他', 'best', '來自', '感興趣', 'these', '嘔', '不', '別管', '數比', '最佳', 'needn', '具體說來', '而有', '再則', '除非', \"don't\", \"wasn't\", '起見', 'y', '此為', '乃', '無', '於', '況且', '倘若', 'been', '去', '以致', '知域', '既然', '就算', '某個', '整體感覺', '很漂亮', '任', '會準', '何', '啥', '抑或', '一何', '譬喻', '就', '新房子手上', '網友熱議', '哈', '及其', '5', 'yourself', '向', '總之', '莫如', '就是了', '趁着', '利於', '使用', '兮', '呵呵', '“', '樓視野', '用光', '某些', '古有明', '憑', '自己', '曾', '下', '倘', '上', '答案', '多少', '以外', '恰恰相反', '實力由此', '不單', 'above', '任何', '下場', 'again', '與其', '路上', '心情', '哪樣', '7', 'why', '到哪去', '有限', '噓', '只有', '嗎', '較', '把', '9', '競相', '除外', 'being', 'can', 'like', '/', '天上', '如此', 'have', '眼見', '眼睛', '幻想篇章', 'such', '有關', '人們', '及至', '大家', '就要', '若要', '終於', '房太', 'see', '依法', '音因', '總是', '因感', '售屋時', 'had', '朝', '如果', '過', '不僅', '持續', '孰知', '充份', ';', '其餘', '有限公司', '會花', '價王', 'year', '人心', '造型天花', '題發文', 'mightn', '用', 'they', '雖說', '連帶', '一轉眼', '\\n\\n◎', '要麼', '房後', '另一方面', '她們', '大公', '氣憤', '賓客目光', '乃至於', '不料', '無虞', '哼唷', '關卡入', '眨眼', 'her', '性需求', '藏金', '8', '怎樣', '文本', '用來', '四大原因', '而況', '反之', 'll', '得了', '甚麼', '領教', '要不然', '小化', '數業者', '這麼點兒', '唄', '這兒', '即', '咋', '您', '小', '內容', '噯', '寓所', '還有', '記者陳韋帆攝', '彼', '步入', '人買房', '嘎登', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\", \"you've\", \"you'll\", \"you'd\", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', \"she's\", 'her', 'hers', 'herself', 'it', \"it's\", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', \"that'll\", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', \"don't\", 'should', \"should've\", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', \"aren't\", 'couldn', \"couldn't\", 'didn', \"didn't\", 'doesn', \"doesn't\", 'hadn', \"hadn't\", 'hasn', \"hasn't\", 'haven', \"haven't\", 'isn', \"isn't\", 'ma', 'mightn', \"mightn't\", 'mustn', \"mustn't\", 'needn', \"needn't\", 'shan', \"shan't\", 'shouldn', \"shouldn't\", 'wasn', \"wasn't\", 'weren', \"weren't\", 'won', \"won't\", 'wouldn', \"wouldn't\", '$', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '?', '_', '“', '”', '、', '。', '《', '》', '一', '一些', '一何', '一切', '一則', '一方面', '一旦', '一來', '一樣', '一般', '一轉眼', '萬一', '上', '上下', '下', '不', '不僅', '不但', '不光', '不單', '不只', '不外乎', '不如', '不妨', '不盡', '不盡然', '不得', '不怕', '不惟', '不成', '不拘', '不料', '不是', '不比', '不然', '不特', '不獨', '不管', '不至於', '不若', '不論', '不過', '不問', '與', '與其', '與其說', '與否', '與此同時', '且', '且不說', '且說', '兩者', '個', '個別', '臨', '爲', '爲了', '爲什麼', '爲何', '爲止', '爲此', '爲着', '乃', '乃至', '乃至於', '麼', '之', '之一', '之所以', '之類', '烏乎', '乎', '乘', '也', '也好', '也罷', '了', '二來', '於', '於是', '於是乎', '云云', '云爾', '些', '亦', '人', '人們', '人家', '什麼', '什麼樣', '今', '介於', '仍', '仍舊', '從', '從此', '從而', '他', '他人', '他們', '以', '以上', '以爲', '以便', '以免', '以及', '以故', '以期', '以來', '以至', '以至於', '以致', '們', '任', '任何', '任憑', '似的', '但', '但凡', '但是', '何', '何以', '何況', '何處', '何時', '餘外', '作爲', '你', '你們', '使', '使得', '例如', '依', '依據', '依照', '便於', '俺', '俺們', '倘', '倘使', '倘或', '倘然', '倘若', '借', '假使', '假如', '假若', '儻然', '像', '兒', '先不先', '光是', '全體', '全部', '兮', '關於', '其', '其一', '其中', '其二', '其他', '其餘', '其它', '其次', '具體地說', '具體說來', '兼之', '內', '再', '再其次', '再則', '再有', '再者', '再者說', '再說', '冒', '衝', '況且', '幾', '幾時', '凡', '凡是', '憑', '憑藉', '出於', '出來', '分別', '則', '則甚', '別', '別人', '別處', '別是', '別的', '別管', '別說', '到', '前後', '前此', '前者', '加之', '加以', '即', '即令', '即使', '即便', '即如', '即或', '即若', '卻', '去', '又', '又及', '及', '及其', '及至', '反之', '反而', '反過來', '反過來說', '受到', '另', '另一方面', '另外', '另悉', '只', '只當', '只怕', '只是', '只有', '只消', '只要', '只限', '叫', '叮咚', '可', '可以', '可是', '可見', '各', '各個', '各位', '各種', '各自', '同', '同時', '後', '後者', '向', '向使', '向着', '嚇', '嗎', '否則', '吧', '吧噠', '吱', '呀', '呃', '嘔', '唄', '嗚', '嗚呼', '呢', '呵', '呵呵', '呸', '呼哧', '咋', '和', '咚', '咦', '咧', '咱', '咱們', '咳', '哇', '哈', '哈哈', '哉', '哎', '哎呀', '哎喲', '譁', '喲', '哦', '哩', '哪', '哪個', '哪些', '哪兒', '哪天', '哪年', '哪怕', '哪樣', '哪邊', '哪裏', '哼', '哼唷', '唉', '唯有', '啊', '啐', '啥', '啦', '啪達', '啷噹', '喂', '喏', '喔唷', '嘍', '嗡', '嗡嗡', '嗬', '嗯', '噯', '嘎', '嘎登', '噓', '嘛', '嘻', '嘿', '嘿嘿', '因', '因爲', '因了', '因此', '因着', '因而', '固然', '在', '在下', '在於', '地', '基於', '處在', '多', '多麼', '多少', '大', '大家', '她', '她們', '好', '如', '如上', '如上所述', '如下', '如何', '如其', '如同', '如是', '如果', '如此', '如若', '始而', '孰料', '孰知', '寧', '寧可', '寧願', '寧肯', '它', '它們', '對', '對於', '對待', '對方', '對比', '將', '小', '爾', '爾後', '爾爾', '尚且', '就', '就是', '就是了', '就是說', '就算', '就要', '盡', '儘管', '儘管如此', '豈但', '己', '已', '已矣', '巴', '巴巴', '並', '並且', '並非', '庶乎', '庶幾', '開外', '開始', '歸', '歸齊', '當', '當地', '當然', '當着', '彼', '彼時', '彼此', '往', '待', '很', '得', '得了', '怎', '怎麼', '怎麼辦', '怎麼樣', '怎奈', '怎樣', '總之', '總的來看', '總的來說', '總的說來', '總而言之', '恰恰相反', '您', '惟其', '慢說', '我', '我們', '或', '或則', '或是', '或曰', '或者', '截至', '所', '所以', '所在', '所幸', '所有', '才', '才能', '打', '打從', '把', '抑或', '拿', '按', '按照', '換句話說', '換言之', '據', '據此', '接着', '故', '故此', '故而', '旁人', '無', '無寧', '無論', '既', '既往', '既是', '既然', '時候', '是', '是以', '是的', '曾', '替', '替代', '最', '有', '有些', '有關', '有及', '有時', '有的', '望', '朝', '朝着', '本', '本人', '本地', '本着', '本身', '來', '來着', '來自', '來說', '極了', '果然', '果真', '某', '某個', '某些', '某某', '根據', '歟', '正值', '正如', '正巧', '正是', '此', '此地', '此處', '此外', '此時', '此次', '此間', '毋寧', '每', '每當', '比', '比及', '比如', '比方', '沒奈何', '沿', '沿着', '漫說', '焉', '然則', '然後', '然而', '照', '照着', '猶且', '猶自', '甚且', '甚麼', '甚或', '甚而', '甚至', '甚至於', '用', '用來', '由', '由於', '由是', '由此', '由此可見', '的', '的確', '的話', '直到', '相對而言', '省得', '看', '眨眼', '着', '着呢', '矣', '矣乎', '矣哉', '離', '竟而', '第', '等', '等到', '等等', '簡言之', '管', '類如', '緊接着', '縱', '縱令', '縱使', '縱然', '經', '經過', '結果', '給', '繼之', '繼後', '繼而', '綜上所述', '罷了', '者', '而', '而且', '而況', '而後', '而外', '而已', '而是', '而言', '能', '能否', '騰', '自', '自個兒', '自從', '自各兒', '自後', '自家', '自己', '自打', '自身', '至', '至於', '至今', '至若', '致', '般的', '若', '若夫', '若是', '若果 ', '若非', '莫不然', '莫如', '莫若', '雖', '雖則', '雖然', '雖說', '被', '要', '要不', '要不是', '要不然', '要麼', '要是', '譬喻', '譬如', '讓', '許多', '論', '設使', '設或', '設若', '誠如', '誠然', '該', '說來', '諸', '諸位', '諸如', '誰', '誰人', '誰料', '誰知', '賊死', '賴以', '趕', '起', '起見', '趁', '趁着', '越是', '距', '跟', '較', '較之', '邊', '過', '還', '還是', '還有', '還要', '這', '這一來', '這個', '這麼', '這麼些', '這麼樣', '這麼點兒', '這些', '這會兒', '這兒', '這就是說', '這時', '這樣', '這次', '這般', '這邊', '這裏', '進而', '連', '連同', '逐步', '通過', '遵循', '遵照', '那', '那個', '那麼', '那麼些', '那麼樣', '那些', '那會兒', '那兒', '那時', '那樣', '那般', '那邊', '那裏', '都', '鄙人', '鑑於', '針對', '阿', '除', '除了', '除外', '除開', '除此之外', '除非', '隨', '隨後', '隨時', '隨着', '難道說', '非但', '非徒', '非特', '非獨', '靠', '順', '順着', '首先', '!', ',', ':', ';', '?', '']\n", "CPU times: user 281 ms, sys: 17.4 ms, total: 298 ms\n", "Wall time: 297 ms\n" ] } ], "source": [ "%%time\n", "#透過Rake_For_Chinese將所有文章的關鍵字提取出來\n", "\n", "import random\n", "\n", "from nltk.corpus import stopwords\n", "import time\n", "import opencc\n", "\n", "converter = opencc.OpenCC('s2t.json')\n", "converter.convert('汉字') # 漢字\n", "cc_stopwords = converter.convert(open(\"cn_stopwords.txt\", \"r\").read()).split('\\n')\n", "\n", "\n", "\n", "obj = Rake()\n", "stop_path = \"./stoplist/中文停用词表(1208个).txt\"\n", "conj_path = \"./stoplist/中文分隔词词库.txt\"\n", "obj.initializeFromPath(stop_path, conj_path)\n", "\n", "\n", "\n", "# 廷用詞列表\n", "with open('customized_stopwords.pickle', 'rb') as handle:\n", " customized_stopwords = pickle.load(handle)\n", "customized_stopwords.extend(stopwords.words('english'))\n", "customized_stopwords.extend(cc_stopwords)\n", "\n", "print(customized_stopwords)" ] }, { "cell_type": "code", "execution_count": 16, "id": "4461d197", "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "【Transformer】Documents embedding ... DONE!\n", "embeddings.shape: (3678, 512)\n", "CPU times: user 1min 3s, sys: 3.35 s, total: 1min 6s\n", "Wall time: 17.9 s\n" ] } ], "source": [ "%%time\n", "from sentence_transformers import SentenceTransformer, util\n", "from transformers import AutoTokenizer, AutoModel, BertTokenizerFast\n", "import torch\n", "import umap\n", "from more_itertools import ichunked\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "\n", "\n", "# documents embedding\n", "print('【Transformer】Documents embedding ... ',end='')\n", "model = SentenceTransformer('distiluse-base-multilingual-cased-v1')\n", "embeddings = model.encode(data['news_content'].tolist())\n", "print('DONE!')\n", "print('embeddings.shape:',embeddings.shape)" ] }, { "cell_type": "code", "execution_count": 17, "id": "b4f7c1cb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "【HDBSCAN】Clustering ...DONE!\n", " -------------------------------------------------------------------------------- \n", "Noise ratio: 90.3 % 3321 / 3678\n", "... 裡下手?想改造居家空間、嘗試裝潢,與其找室內設計師,也許自己來比較划算?租屋或和長輩同住,不方便改動房子格局?或只想改動部分區域?如果你有這些考量,那 「軟裝潢」(又稱軟裝、軟裝修) 就是很好的選擇。新購屋、租屋族都瘋!軟裝潢為何受歡迎?幸福空間軟裝部視覺設計總監陳曉雲形容,室內設計師就像整型醫生,負責空間規畫、動線格局與整體風格營造等工作,以新成屋來看,每坪 3 萬~10 萬元不等;而 「軟裝師」就像造型師,為房子塗油漆、增添家具家飾,每坪含家具與設計費用約 1 萬元上下,改造期只要 3~5 天 ,且能分房進行、改造後復原度高,受租屋、新購屋與老屋改造族群青睞。乍聽之下,挑家具、塗油漆這些工 ...\n", " --------------------------------------------------------------------------------\n", "Predict_doc (target domain) is predicted to be in cluster # -1 (noise)\n", " -> Replace with the largest cluster # 31\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 17.7 s, sys: 259 ms, total: 18 s\n", "Wall time: 17.9 s\n" ] } ], "source": [ "%%time\n", "import hdbscan\n", "import matplotlib.pyplot as plt\n", "from collections import Counter\n", "from itertools import compress\n", "\n", "from collections import Counter\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import matplotlib.colors as mcolors\n", "from matplotlib.pyplot import figure\n", "\n", "\n", "# HDBSCAN clustering\n", "print('【HDBSCAN】Clustering ...',end='')\n", "hclusterer = hdbscan.HDBSCAN(prediction_data=True).fit(embeddings) #embeddings_list\n", "print('DONE!\\n','-'*80,'\\nNoise ratio:',round(list(hclusterer.labels_).count(-1) / len(embeddings),3)*100,'% ',list(hclusterer.labels_).count(-1),'/',len(embeddings))\n", "\n", "\n", "# approximate predict cluster to find target domain\n", "# you can customize what you want to predict\n", "predict_doc = sample(data[data['news_content'].str.contains('幸福空間')]['news_content'].tolist(), 1)[0].replace('\\n','')\n", "print('...',predict_doc[50:350], '...\\n','-'*80)\n", "test_labels, strengths = hdbscan.approximate_predict(hclusterer, model.encode([predict_doc]))\n", "target_domain_cluster = test_labels[0]\n", "print('Predict_doc (target domain) is predicted to be in cluster #',target_domain_cluster,end='')\n", "if target_domain_cluster == -1:\n", " temp = Counter(hclusterer.labels_)\n", " del temp[-1]\n", " target_domain_cluster = max(temp.items(), key=operator.itemgetter(1))[0]\n", " print(' (noise)\\n -> Replace with the largest cluster #',target_domain_cluster)\n", "\n", "\n", "\n", "\n", "fig, ax = plt.subplots(figsize=(40, 5), dpi=72)\n", "labels, values = zip(*sorted(Counter(hclusterer.labels_[hclusterer.labels_!=-1]).items()))\n", "indexes = np.arange(len(labels))\n", "width = 0.9\n", "bars = ax.bar(indexes, values, width, alpha=0.2, color = [[color for name, color in mcolors.TABLEAU_COLORS.items()][i%2] for i in sorted(Counter(hclusterer.labels_[hclusterer.labels_!=-1]))])\n", "plt.xticks(indexes, labels)\n", "ax.bar_label(bars, fmt='%d', padding=-14)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 18, "id": "e40ae708", "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "num of news in the cluster # 31 : 77\n", "news in the cluster # 31 :\n", "\n", "... 市寸土寸金,不只租屋族被迫蟻居,現在竟然連新建案都出現不到15坪的兩房。一名網友表示,最近在觀望北市的兩房建案,原本對坪數的要求最少要15坪,結果越看越絕望,很多12、13坪的兩房也貴桑桑,最誇張的是9坪也硬隔成兩房。\n", "\n", "原PO在PTT表示,他看了很多兩房新建案都15坪以下,甚至8、9坪的小套房硬要隔成兩房,居住空間被壓縮到極限,坪數稍大的兩房價格卻又讓人無法親近,「有些好不容易到15、16坪的要上到近3000萬」,很好奇現在台北人對於坪數的底線在哪,難道願意住在15坪都不到的兩房嗎?\n", "\n", "▲小宅是台北市普遍現象。(示意圖/記者張菱育攝)\n", "\n", "\n", "\n", "網友指出,北市仍有很多大坪數的兩房,但總價遠遠超乎 ...\n", " --------------------------------------------------------------------------------\n", "... 年重劃區的發展日益茁壯,也有很多人會選擇重劃區做為看房買房的首選,最廣為受到討論的就有好幾個,包括桃園青埔、新店央北、板橋江翠北、新莊副都心以及頭前重劃區。有人在社團中問到,自己的預算約1200萬,想在頭前重劃區買不附停車位小坪數2房,不少網友紛紛回應原PO,這價格選擇很少,就算有,也會被秒殺。\n", "\n", "原PO在新莊地方社團問網友是否有推薦的房屋物件、推薦建案,預算1200萬想在頭前重劃區買小坪數2房,並說到不需要車位,問了仲介都說這個價位沒有物件可以選擇。\n", "\n", "▲新莊頭前重劃區一直備受討論。(圖/記者許凱彰攝)\n", "\n", "底下網友回應提到,「附車位比較保值,如果不使用可以收租金」、「上個月有釋出(符合需求物 ...\n", " --------------------------------------------------------------------------------\n", "... 讓售屋民眾猶豫的一大問題。近日有網友在PTT發文詢問空屋和裝潢屋,哪個比較好賣。若在房子出售前先裝潢過,是否真的會較容易賣出或影響出售行情。對此,專家表示,這點因人而異。\n", "許多網友表示是否會變比較好賣要看裝潢品味決定,「有些裝潢很個人風,你買到又要拆」、「看你裝潢後要加多少錢,和你國中美術課有沒有翹課」、「櫃子做太多的裝潢很不好」、「請不要亂用顏色,請只用黑白灰」、「裝潢最悲劇就是色系亂搭,穩死;其次才是質材亂搭,死的沒那麼慘。但是只要色系是和諧的,質材不要跳太大,那隨便搭都不會有甚麼問題」。\n", "?台北千萬退休宅怎麼選?曝優劣分析\n", "?雙北買氣最旺捷運站出爐!「他」第一!\n", "? 首購要買哪?萬名網友 ...\n", " --------------------------------------------------------------------------------\n", "... 不再那麼迫切。而目前養老服務體系建立的更加健全,年輕人更願意自己居住,而不是和父母輩住在一起。加上思想的轉變,頂客、不婚族的數量也日益龐大,因此小房子也將愈來愈搶手。專家表示,「小坪數房型」是當前的時勢所趨,未來這類型的房子不僅受歡迎,保值性更是無庸置疑。\n", "根據內政部資料顯示,雙北房市第二季交易坪數占比中,20~35坪(土地+建物)占36.9%,35~50坪占20.7%,新北市房市,20~35坪(土地+建物)占43.5%,35~50坪占23.5%,可見小坪數物件仍占市場大宗。因此專家分析,「小坪數房型」儼然已成為現今必備的產品與趨勢,而其中也存在一定優勢,才能穩住市場,吸引買家入手。\n", "?台北千 ...\n", " --------------------------------------------------------------------------------\n", "... 但你知道嗎?裝潢是件超級麻煩的事,下手前還要思考這樣的設計是否實用以及耐用程度夠不夠持久等問題。所以房屋裝潢當然不能馬虎,功課做足,才不會花了一大筆裝潢費後才來後悔。\n", "日前有網友在《Mobile01》發問,因為自己花了很多錢在裝潢上,但是時間久了才覺得根本不實用,而且也發現有些材質根本不耐用,當初的想法不夠周到,所以藉此想詢問大家「什麼裝潢最沒用?」,文章一PO釣出網友們分享自身裝潢經驗,也有人點出關鍵「很多裝潢設計到最後很難維持,像我因為每天下班回家都累得要死,根本不可能打掃,但有些裝潢死角很多加上材質問題,外表看起來是很漂亮,但如果沒有按時清潔就會卡一層厚厚灰塵顯得非常髒。」\n", "㊙想買保值宅 ...\n", " --------------------------------------------------------------------------------\n", "... 其現在年輕人對於家中的裝潢設計非常講究,就算空間小也要布置到讓人賞心悅目,裝潢設計師盧淑媛建議,在預算有限的狀況下,也可以把裝潢效果做到最好,只要記得「六大省錢招術」就可以讓你花小錢大變身,居家質感UP!\n", "一、減少天花板工程\n", "?台北千萬退休宅怎麼選?曝優劣分析\n", "?雙北買氣最旺捷運站出爐!「他」第一!\n", "? 首購要買哪?萬名網友狂推「這區」\n", "?1字頭熱門宅都在這!2021恐成絶響\n", "天花板包覆工程需要一筆費用,這筆錢也不容小覷,做天花板得動用到木工、油漆,不僅費用高,施工時間也長。近幾年因工業風的裝潢風格盛行,若想省錢也可以不要額外做天花板,將管線重新粉刷成自己喜歡的顏色,再搭配軌道燈設計,也可以變 ...\n", " --------------------------------------------------------------------------------\n", "... 因此近幾年盛行小坪數的「小宅」,總價較低讓許多年輕首購族趨之若鶩,加上現今少子化、孩子生得少,小宅產品更夯。但有些建案對小宅的設計上並不是很理想,包括空間規劃、動線、格局…都讓人看了搖頭;台灣建築中心執行長許世杰表示,建商沒有以「人居」思維做考量。\n", "房價高衍生出現今這樣的「小宅」趨勢,加上少子化現象,許多建商都推小坪數產品,總價較低,民眾較能負擔得起;坪數大約20坪至30坪左右,有些物件甚至不到20坪,但扣掉公設後,剩下的實際室內面積真的不大,且有些建案的格局與動線設計並不是太優,消費者仍須注意。\n", "㊙想買保值宅?買房選「這」最抗跌!\n", "?雙北買氣最旺捷運站出爐!「他」第一!\n", "? 首購要買哪?萬名 ...\n", " --------------------------------------------------------------------------------\n", "... 代,小坪數成為普通家庭首選。但是,一些小坪數的屋主,他們在裝修設計上因為坪數小,會受到更多的限制,喜歡的設計方案無法實施,這是非常苦惱的。眾所周知,門在裝修中屬於非常重要的一個部分,可是小坪數對於房門的選擇也有很多限制,最常見的推門浪費空間,滑門裝不好噪音不斷。\n", "\n", "對此,有裝修設計業內人士推薦,如果要避掉以上煩惱,那麼,折疊門將是小坪數房屋很好的選擇,可以一秒解放空間,「折疊門不僅省地,用起來還很方便,不會出現很多磕磕碰碰地的問題。」\n", "\n", "他指出,首先,折疊門外觀上比推拉門更加美觀,且裝飾風格比較獨特,「感覺既像傳統的屏風,又像現代的透視門窗,給家居設計帶來非常好的裝飾效果,讓人有種眼前一亮的視 ...\n", " --------------------------------------------------------------------------------\n", "... 卻有網友在PTT上討論「對透天厝越蓋越小真的非常有感」。\n", "往往民眾選擇透天物件,就是希望享有大坪數及寬闊感,不料網友點出近來在台中各地看房,才發現新建大樓、公寓坪數被壓縮很小,變成小三房或是小二房格局,然而這樣的景象甚至也擴散影響了透天厝市場。\n", "?首購必看!買房「隱性成本」明細大公開!\n", "?雙北買氣最旺捷運站出爐!「他」第一!\n", "? 首購要買哪?萬名網友狂推「這區」\n", "?1字頭熱門宅都在這!2021恐成絶響\n", "民眾點出消費者銀彈不足,讓部分透天厝出現「小宅化」問題。示意圖/取自photoAC\n", "原PO表示,新的透天厝蓋得很小,原本面寬可以停雙車位,現在變成只能停一輛車,大約400多公分,連前後長度也被明 ...\n", " --------------------------------------------------------------------------------\n", "... 出「小宅」物件,盼以低總價等優勢來吸引民眾目光。網路上也有人感嘆,現在新房的空間越來越小,擔心住起來不舒適。\n", "一名網友就在PTT上發問,老家跟自家都至少3房2廳25坪以上,對四口家庭來說勉強過得去,但現在新房越來越小,在大台北地區一般受薪買得起的房子,對應的室內坪數頂多20坪,甚至15~18坪左右。由於自己沒住過類似大小的房子,而目前居住人數1~2人,盼住在這坪數範圍的人能分享心得。\n", "㊙想買保值宅?買房選「這」最抗跌!\n", "?雙北買氣最旺捷運站出爐!「他」第一!\n", "? 首購要買哪?萬名網友狂推「這區」\n", "?1字頭熱門宅都在這!2021恐成絶響\n", "網友請教小坪數過來人的居住心得。示意圖/取自photoAC ...\n", " --------------------------------------------------------------------------------\n", "CPU times: user 2.47 ms, sys: 0 ns, total: 2.47 ms\n", "Wall time: 1.33 ms\n" ] } ], "source": [ "%%time\n", "\n", "from tqdm import tqdm\n", "from random import sample\n", "\n", "\n", "# check news in the target domain cluster or other cluster\n", "cluster_num = target_domain_cluster\n", "fil = [l==cluster_num for l in hclusterer.labels_]\n", "doc_list = list(compress(data['news_content'].tolist(), fil))\n", "\n", "print('num of news in the cluster #',cluster_num,':', len(doc_list))\n", "print('news in the cluster #',cluster_num,':\\n')\n", "for d in sample(doc_list,min(10,len(doc_list))):\n", " print('...',d[50:350], '...\\n','-'*80)" ] }, { "cell_type": "code", "execution_count": 19, "id": "18b801b7", "metadata": {}, "outputs": [], "source": [ "def find_tags(doc_list, customized_stopwords = customized_stopwords):\n", " tag_list=[]\n", " pbar = tqdm(range(len(doc_list)))\n", " pbar.set_description(\"[Extracting keywords...]\")\n", "\n", " fail_count = 0\n", "\n", " for d in doc_list:\n", " pbar.update()\n", "\n", " try:\n", " result = obj.extractKeywordFromString(d) #, num_kw=6\n", " except:\n", " result = 'a'\n", " fail_count +=1\n", " print('-'*80)\n", " print('Keywords of this news are not available:\\n','...',d[50:250], '...\\n','-'*80)\n", "\n", "\n", " tags = list(filter(lambda x: len(x)>1 and x not in list(set(customized_stopwords)), result))\n", " tag_list.extend(tags)\n", " tag_list = list(set(tag_list))\n", "\n", " pbar.close()\n", " \n", " print('Num of keywords:', len(tag_list))\n", " print('Fail:', fail_count,'\\n')\n", " \n", " return tag_list" ] }, { "cell_type": "code", "execution_count": 20, "id": "3d384c2d", "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Extracting keywords...]: 0%| | 0/10 [00:00