# shop_item_crawler.py
  1. # -*- coding: utf-8 -*-
  2. from seleniumwire import webdriver
  3. from selenium.webdriver.common.action_chains import ActionChains
  4. from selenium.webdriver.common.keys import Keys
  5. from selenium.webdriver.support import expected_conditions as EC
  6. from selenium.webdriver.support.wait import WebDriverWait
  7. from selenium.webdriver.common.by import By
  8. import selenium
  9. import traceback
  10. from bs4 import BeautifulSoup
  11. from utility import database_access as DA
  12. from utility.parseutils import *
  13. from utility.connect import *
  14. from datetime import datetime
  15. import pandas as pd
  16. import dataset
  17. import requests, random, time, json
  18. import re, sys, os
  19. import socket, brotli
  20. from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  21. import urllib.parse
# True = drive a visible local Chrome; False = remote dockerised Chrome behind the proxy.
chrome_window=False
# chrome_window=True  # flip for local debugging
# Current search keyword; set from argv in main() and updated per job.
globalkw=None
# Port the selenium-wire proxy listens on; overridden by argv in main().
proxyport=8787
  26. def build_cache(db):
  27. id_dict={}
  28. cursor = db.query('SELECT place_id FROM {}.{};'.format(MYSQL_CONFIG['MYSQL_DB'], TABLE_STORE_LIST))
  29. for c in cursor:
  30. id_dict[c['place_id']]=1
  31. return id_dict
  32. def brower_start(port):
  33. global proxyport
  34. global chrome_window
  35. print(proxyport)
  36. options = webdriver.ChromeOptions()
  37. if chrome_window:
  38. browser = webdriver.Chrome(
  39. desired_capabilities=options.to_capabilities()
  40. )
  41. else:
  42. chrome_options = webdriver.ChromeOptions()
  43. chrome_options.add_argument('--proxy-server=host.docker.internal:'+str(proxyport)) # Specify your Kubernetes service-name here
  44. chrome_options.add_argument('--ignore-certificate-errors')
  45. chrome_options.add_argument("--no-sandbox")
  46. chrome_options.add_argument("--disable-dev-shm-usage")
  47. browser = webdriver.Remote(
  48. command_executor='http://127.0.0.1:'+str(port)+'/wd/hub',
  49. desired_capabilities=chrome_options.to_capabilities(),
  50. seleniumwire_options={'addr':'0.0.0.0','port':proxyport,'auto_config': False}
  51. )
  52. # seleniumwire_options = {'addr': '172.17.0.2','port':4444})
  53. browser.set_window_size(1400,1000)
  54. return browser
def keyin_keyword(driver, keyword):
    """Type `keyword` into the Google Maps search box and submit with ENTER."""
    button = driver.find_element_by_id("searchbox")
    driver.implicitly_wait(30)
    ActionChains(driver).move_to_element(button).send_keys(keyword).send_keys(Keys.RETURN).perform()
    time.sleep(3)  # give the search results time to load
  60. def scan_job(db, kw):
  61. result = {'kw' : kw}
  62. table_name = '{}.{}'.format(MYSQL_CONFIG['MYSQL_DB'],MYSQL_CONFIG['TABLE_CONV_LOG'])
  63. cursor = db.query('select t1.num,next-prev as diff from {} t1, \
  64. (SELECT num,max(id) mid FROM {} group by num ) t2 \
  65. where t1.id=t2.mid having diff>0 order by rand()'.format(table_name, table_name))
  66. for c in cursor:
  67. result['num']=c['num']
  68. break
  69. cursor = db.query('select lat,lon,loc from {} where num ="'.format(TABLE_LAT_LON)+str(result['num'])+'"')
  70. for c in cursor:
  71. result['lat'] = c['lat']
  72. result['lon'] = c['lon']
  73. result['loc'] = c['loc']
  74. return result
def get_next_job(db, repeat=False, repkw=None, repnum=None):
    """Select the next (keyword, area) crawl job.

    Normal mode: pick a random not-yet-expanded row from the areacodes table
    and attach its lat/lon/loc. `repkw` overrides the keyword. The special
    value 'REP' in repkw/repnum triggers repeat mode: re-crawl a (random or
    given) area num already present in the store list.

    :return: dict with 'kw' and, when available, 'num'/'lat'/'lon'/'loc'.

    NOTE(review): reconstructed from an indentation-stripped source — the
    exact nesting in the 'REP' branch should be confirmed against the
    original file.
    """
    global globalkw
    result={}
    # Random unexpanded area; first row wins.
    cursor = db.query('select kw, num from {} where expand = 0 order by rand()'.format(TABLE_AREACODES))
    for c in cursor:
        if repkw is None:
            repkw = c['kw']
        result['kw'] = c['kw']
        result['num'] = c['num']
        break
    # An explicit repkw always overrides the keyword picked above.
    if repkw is not None:
        result['kw'] = repkw
    if result.get('num') is not None:
        cursor = db.query('select lat,lon,loc from {} where num ="{}"'.format(TABLE_LAT_LON, str(result['num'])))
        for c in cursor:
            result['lat']=c['lat']
            result['lon']=c['lon']
            result['loc']=c['loc']
            break
    if repeat and repkw!= 'REP':
        result['kw']=repkw
        result['num']=repnum
    if 'REP' in repkw:
        # Repeat mode: 'REP' as repnum means "pick a random already-crawled num".
        if repnum=='REP':
            repnum=None
            cursor = db.query('select num from {} order by rand() limit 1'.format(TABLE_STORE_LIST))
            for c in cursor:
                repnum=c['num']
                break
        # Fallback: still no num (caller passed None) — pick one at random.
        if repnum is None:
            cursor = db.query('select num from {} order by rand() limit 1'.format(TABLE_STORE_LIST))
            for c in cursor:
                repnum=c['num']
                break
        # Reuse the stored keyword/coordinates of that area.
        cursor = db.query('select lat_txt,lon_txt,keyword,num from {} where num="{}" limit 1'.format(TABLE_STORE_LIST, str(repnum)))
        for c in cursor:
            result['kw']=c['keyword']
            result['num']=c['num']
            result['lat']=c['lat_txt']
            result['lon']=c['lon_txt']
            result['loc']=''
        return result
    if repeat:
        # Plain repeat: random previously crawled location, keyword included.
        cursor = db.query('select lat_txt,lon_txt,keyword from {} order by rand() limit 1'.format(TABLE_STORE_LIST))
        for c in cursor:
            result['kw']=c['keyword']
            result['lat']=c['lat_txt']
            result['lon']=c['lon_txt']
    return result
  124. def write_to_file(jsobj,fname):
  125. import codecs
  126. fw=codecs.open(fname,'w','utf-8')
  127. fw.write(str(jsobj))
  128. fw.close()
  129. def parsing_js(orig):
  130. resultobj=[]
  131. content=""
  132. lines=orig.split('\n')
  133. for l in lines:
  134. newl=l.replace('\\"','"')
  135. newl=newl.replace('\\"','"')
  136. content+=newl
  137. result=re.search(r'\[\["',content)
  138. print(result)
  139. content_begin=result.start()
  140. result=re.search(r'\]\]"',content)
  141. print(result)
  142. content_end=result.end()
  143. jscontent=content[content_begin:content_end-1]
  144. # write_to_file(jscontent,'c:/tmp/debug.txt')
  145. jsobj=json.loads(jscontent)
  146. for x in jsobj[0][1][1:]:
  147. print(x[14][11])
  148. print(x[14][9])
  149. reviews_cnt=None
  150. photo=None
  151. rating=None
  152. biz_id=None
  153. loc_x=None
  154. loc_y=None
  155. addr_elmts=None
  156. tel=None
  157. try:
  158. rating=x[14][4][7]
  159. reviews_cnt=x[14][4][8]
  160. except:
  161. traceback.print_exc()
  162. try:
  163. photo=x[14][37][0][0][0]
  164. num_photos=x[14][37][0][0][6][1]
  165. except:
  166. traceback.print_exc()
  167. try:
  168. loc_x=x[14][37][0][0][29][0]
  169. loc_y=x[14][37][0][0][29][1]
  170. except:
  171. traceback.print_exc()
  172. try:
  173. biz_id=x[14][57][2]
  174. tel=x[14][178][0][3]
  175. except:
  176. traceback.print_exc()
  177. try:
  178. addr_elmts=str(x[14][82])
  179. except:
  180. traceback.print_exc()
  181. category=str(x[14][13])
  182. topic=str(x[14][89])
  183. print(x[14][13])
  184. print(x[14][10])
  185. print(x[14][2])
  186. print(x[14][78])
  187. try:
  188. resultobj.append({'name':x[14][11],'fid':x[14][10],'addr':x[14][2][0],'addr_elmts':addr_elmts,'place_id':x[14][78],'category':category,'rating':rating,'reviews_cnt':reviews_cnt,'lat':x[14][9][2],'lat_txt':str(x[14][9][2]),'lon':x[14][9][3],'lon_txt':str(x[14][9][3]),'topic':topic,'photo':photo,'num_photos':num_photos,'loc_x':loc_x,'loc_y':loc_y,'biz_id':biz_id,'tel':tel,'crawler_date':datetime.today().strftime("%Y/%m/%d %H:%M")})
  189. except:
  190. traceback.print_exc()
  191. return resultobj
def save_js_to_db(jsobj, num, keyword):
    """Insert parsed store records into the global store-list table.

    Records whose place_id is already in the `iddict` cache (built by
    build_cache()) are skipped for deduplication. Each row is tagged with
    the area `num` and search `keyword` that produced it.
    """
    global store_list_table
    global iddict
    for r in jsobj:
        # Already crawled — skip.
        if iddict.get(r['place_id']) is not None:
            continue
        r['num']=num
        r['keyword']=keyword
        try:
            store_list_table.insert(r)
        except:
            # Best-effort insert: log the DB error and keep going.
            traceback.print_exc()
  204. def process_web_request(db, driver, area_num, keyword):
  205. global prev_cnt
  206. request_url = None
  207. time.sleep(0.8)
  208. time.sleep(3)
  209. print("ppppppppp&**********************")
  210. for request in driver.requests:
  211. if 'search?' in request.url :
  212. print('searching.....')
  213. if request.response:
  214. if 'search?' in request.url :
  215. print('parsing js:')
  216. print(request.url)
  217. resp = brotli.decompress(request.response.body)
  218. jstext = resp.decode('utf-8')
  219. resultobj = parsing_js(jstext)
  220. print("before",datetime.now())
  221. print("num: "+str(area_num))
  222. save_js_to_db(resultobj, area_num, keyword)
  223. print("after",datetime.now())
  224. aft_cnt=0
  225. cursor = db.query('select count(*) as cnt from {} where num="{}" '.format(TABLE_STORE_LIST, str(area_num)))
  226. for c in cursor:
  227. aft_cnt=c['cnt']
  228. break
  229. db[TABLE_CONV_LOG].insert({'num':area_num,'prev':prev_cnt,'next':aft_cnt,'dt':datetime.now()})
  230. del driver.requests
  231. def check_area_code(db, kw):
  232. if kw:
  233. table_name = '{}.{}'.format(MYSQL_CONFIG['MYSQL_DB'], TABLE_AREACODES)
  234. result = db.query('select distinct(kw) from {}'.format(table_name))
  235. result = [i['kw'] for i in result]
  236. if kw not in result:
  237. try:
  238. sql = 'insert into {} (select num,"{}" as kw, 0 as expand from {}) '.format(table_name, kw, TABLE_LAT_LON)
  239. db.query(sql)
  240. except:
  241. traceback.print_exc()
  242. def page_down_(driver, time_):
  243. try:
  244. # action = webdriver.ActionChains(driver)
  245. # element = driver.find_element_by_css_selector('a[aria-label="清除搜尋"]')
  246. # print(element)
  247. # height = element.size['height']
  248. # width = element.size['width']
  249. # action.move_to_element(element).move_by_offset(-width, height).click().perform()
  250. action = webdriver.ActionChains(driver)
  251. element = driver.find_element_by_css_selector('div[class="TFQHme"]')
  252. action.move_to_element(element).click().perform()
  253. time.sleep(1)
  254. driver.back()
  255. time.sleep(1)
  256. for i in range(time_):
  257. print(i)
  258. actions = ActionChains(driver)
  259. actions.send_keys(Keys.END).perform()
  260. time.sleep(0.5)
  261. except:
  262. traceback.print_exc()
def main():
    """Crawl loop: pick an (area, keyword) job, open Google Maps at jittered
    coordinates, trigger a search and persist the parsed results.

    CLI: <keyword> <selenium hub port> <proxy port>.
    """
    global chrome_window
    global store_list_table
    global globalkw
    global proxyport
    global iddict
    global prev_cnt
    port=4447
    if len(sys.argv)>1:
        globalkw=sys.argv[1]
        port=int(sys.argv[2])
        proxyport=int(sys.argv[3])
    print(globalkw, port, proxyport)
    failcnt=0
    localip=socket.gethostbyname(socket.gethostname())
    db = dataset.connect('mysql://{}:{}@{}/{}?charset=utf8mb4'.format( MYSQL_CONFIG['MYSQL_USER'],
        MYSQL_CONFIG['MYSQL_PASSWORD'], MYSQL_CONFIG['MYSQL_HOST'], MYSQL_CONFIG['MYSQL_DB']))
    store_list_table = db[TABLE_STORE_LIST]
    table2 = db[TABLE_PROGRESS_LIST]
    if not chrome_window:
        # Headless/docker mode: restart the paired browser container first.
        print('restart docker pw{}'.format(port))
        os.system('sudo docker container restart pw'+str(port))
        # os.system('docker container restart p'+str(port))
        time.sleep(10)
    print('drvier start...')
    driver = brower_start(port)
    check_area_code(db, globalkw)
    for i in range(368):
        area_num=None
        # if len(sys.argv) > 4 :
        #     repkw = sys.argv[1]
        #     repnum = sys.argv[2]
        #     if 'SCAN' in repkw:
        #         job = scan_job(db, repnum)
        #     else:
        #         job = get_next_job(db, repeat=True, repkw=repkw, repnum=repnum)
        # else:
        job = get_next_job(db, repkw=globalkw)
        print(job)
        keyword = job['kw']
        globalkw = keyword
        latitude = job['lat']   # latitude
        longitude = job['lon']  # longitude
        area_num = job['num']
        safe_string = urllib.parse.quote_plus(keyword)
        for j in range(5):
            iddict = build_cache(db)
            # After the first pass, jitter the coordinates by up to ~0.1 degree
            # so repeated searches cover slightly different map viewports.
            if j != 0:
                latitude_ = float(latitude) + (random.randint(-999,999) / 10000)
                longitude_ = float(longitude) + (random.randint(-999,999) / 10000)
            else:
                latitude_, longitude_ = latitude, longitude
            url = 'https://www.google.com.tw/maps/@{},{},15z?hl=zh-TW'.format(latitude_, longitude_)
            print(url)
            # Store count before this pass, for the conversion-log delta.
            prev_cnt=0
            cursor = db.query('select count(*) as cnt from {} where num="{}" '.format(TABLE_STORE_LIST, str(area_num)))
            for c in cursor:
                prev_cnt = c['cnt']
                break
            driver.get(url)
            time.sleep(2)
            keyin_keyword(driver, keyword)
            # page_down_(driver, 3)
            process_web_request(db, driver, area_num, keyword)
            time.sleep(1)
        table2.insert({'kw':keyword,'num':job['num']},['kw'])
        db.query(f'update {TABLE_AREACODES} set expand = 1 where num="'+str(job['num'])+'" and kw="'+keyword+'" ')
# Script entry point.
if __name__ == '__main__':
    main()