|
@@ -0,0 +1,119 @@
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+import logging
|
|
|
+
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
+
|
|
|
+
|
|
|
def get_section_parser(section_class_name):
    """Return the parser class registered for ``section_class_name``.

    The lookup is done in the module-level ``SECTION_PARSER`` mapping; when
    the name has no entry there, the generic ``SectionParser`` class is
    returned instead.
    """
    try:
        return SECTION_PARSER[section_class_name]
    except KeyError:
        # No specialised parser registered for this section.
        return SectionParser
|
|
|
+
|
|
|
+
|
|
|
class SectionParser:
    """Rewrite section markup one line at a time using replacement data.

    ``update`` is meant to be called once per line of markup. Each line that
    matches a rule consumes the next unused entry from the corresponding list
    in ``update_data``; per-kind counters on the instance remember how many
    entries have been consumed so far.

    Keys read from ``update_data``:

    * ``'b'``, ``'p'``, ``'a'``, ``'card_text'``, ``'title_mb_text'`` --
      lists of replacement strings for the matching tag's text.
    * ``'img'`` -- list of dicts whose ``'src'`` value replaces the ``src``
      attribute of successive ``<amp-img>`` tags.
    * ``'yt_video'`` -- list of dicts whose ``'videoid'`` value replaces the
      ``data-videoid`` attribute of successive ``<amp-youtube>`` tags.
    """

    # (marker looked for in the line, tag whose text is replaced,
    #  key in update_data, name of the counter attribute).
    # Order matters: the first marker found in the line wins.
    _TEXT_RULES = (
        ('<b', 'b', 'b', 'b_order'),
        ('<p', 'p', 'p', 'p_order'),
        ('<a ', 'a', 'a', 'a_order'),
        ('card-text', 'div', 'card_text', 'card_text_order'),
        ('title mb', 'div', 'title_mb_text', 'title_mb_text_order'),
    )

    def __init__(self, update_data):
        """Store ``update_data`` and reset all counters and state flags."""
        self.update_data = update_data
        self.b_order, self.p_order, self.img_order, self.a_order = 0, 0, 0, 0
        self.card_text_order, self.title_mb_text_order, self.yt_video_order = 0, 0, 0
        # True while the lines being fed are inside an <amp-img> /
        # <amp-youtube> element (between its opening and closing tag).
        self.is_amp_img, self.is_amp_youtube = False, False

    def _get_spaces(self, text):
        """Return the space prefix to put in front of a re-serialised line.

        The result is a run of spaces one shorter than the number of
        characters that precede the first ``'<'`` in ``text`` (or than the
        whole of ``text`` when it contains no ``'<'``). It is empty when
        fewer than two characters precede the ``'<'``.
        """
        prefix_len = text.find('<')
        if prefix_len == -1:
            prefix_len = len(text)
        return (prefix_len - 1) * ' '

    def _apply_text_rule(self, text):
        """Apply the first matching text rule, or note an amp tag opening.

        Returns the rewritten line, or ``text`` unchanged when no text rule
        matched. A counter is only advanced after its line was rewritten
        successfully, so a failure leaves the parser state untouched.
        """
        for marker, tag_name, data_key, counter in self._TEXT_RULES:
            if marker not in text:
                continue
            index = getattr(self, counter)
            soup = BeautifulSoup(text, "html.parser")
            # Raises IndexError when the replacement list is exhausted and
            # AttributeError when the line holds no such tag; update() logs
            # either and leaves the line as it was.
            getattr(soup, tag_name).string = self.update_data.get(data_key, [''])[index]
            rewritten = self._get_spaces(text) + str(soup)
            setattr(self, counter, index + 1)
            return rewritten

        if '<amp-img' in text:
            self.is_amp_img = True
        elif '<amp-youtube' in text:
            self.is_amp_youtube = True
        return text

    @staticmethod
    def _set_last_attribute(text, attribute, value):
        """Replace everything from the last ``attribute=`` to the end of line.

        Text before the last occurrence is kept verbatim (including any
        earlier occurrences of ``attribute=``); the remainder of the line is
        replaced by ``attribute="value"`` followed by a newline.
        """
        head = text.rpartition(attribute + '=')[0]
        return '{}{}="{}"\n'.format(head, attribute, value)

    def _apply_amp_rule(self, text):
        """Rewrite the attribute line of the amp element currently open.

        While inside an ``<amp-img>`` the ``src`` attribute is replaced, and
        while inside an ``<amp-youtube>`` the ``data-videoid`` attribute is
        replaced. Seeing the closing tag clears the corresponding flag and
        advances to the next replacement entry.
        """
        if self.is_amp_img:
            if 'src=' in text:
                src = self.update_data.get('img', [{}])[self.img_order].get('src', '')
                text = self._set_last_attribute(text, 'src', src)
            if '</amp-img>' in text:
                # Leaving the element: stop treating later lines as part of it.
                self.is_amp_img = False
                self.img_order += 1
        elif self.is_amp_youtube:
            if 'data-videoid' in text:
                video_id = self.update_data.get(
                    'yt_video', [{}])[self.yt_video_order].get('videoid', '')
                text = self._set_last_attribute(text, 'data-videoid', video_id)
            if '</amp-youtube>' in text:
                self.is_amp_youtube = False
                self.yt_video_order += 1
        return text

    def update(self, content, text):
        """Append one (possibly rewritten) line to the accumulated content.

        Args:
            content: The output accumulated from previous lines.
            text: The current line of markup.

        Returns:
            ``content`` with the processed line appended. If rewriting the
            line fails, the error is logged and the line is appended in the
            state it had reached before the failure (best effort).
        """
        try:
            text = self._apply_text_rule(text)
            text = self._apply_amp_rule(text)
        except Exception as err:
            logger.error('section parser failed with %s', err)
        return content + text
|
|
|
+
|
|
|
+
|
|
|
class Section18SectionParser(SectionParser):
    """Parser variant for ``section18`` markup.

    Recognises, in this order of priority, lines containing ``'title mb'``,
    the single line that follows the second such title (treated as a plain
    ``<div>``), lines containing ``'mb-5'`` and lines containing ``'<a '``.

    In addition to the keys the base class reads, this class reads
    ``'pure_div_text'`` and ``'mb_5_text'`` from ``update_data``; both are
    lists of replacement strings.
    """

    def __init__(self, update_data):
        """Initialise base-class state plus the section18-specific counters."""
        super().__init__(update_data)
        # Set once the second 'title mb' line has been rewritten; the next
        # line that is successfully rewritten as a div clears it again.
        self.is_pure_div = False
        self.is_pure_div_order, self.mb_5_order = 0, 0

    def _update_div_data(self, text, update_key, order):
        """Return ``text`` with its first ``<div>``'s text replaced.

        The replacement is entry ``order`` of the ``update_key`` list in
        ``update_data``. Raises ``IndexError`` when that list has no such
        entry and ``AttributeError`` when ``text`` contains no ``<div>``.
        """
        soup = BeautifulSoup(text, "html.parser")
        soup.div.string = self.update_data.get(update_key, [''])[order]
        return self._get_spaces(text) + str(soup)

    def update(self, content, text):
        """Append one (possibly rewritten) line to the accumulated content.

        Args:
            content: The output accumulated from previous lines.
            text: The current line of markup.

        Returns:
            ``content`` with the processed line appended. If rewriting the
            line fails, the error is logged and the line is appended
            unchanged (best effort).
        """
        try:
            if 'title mb' in text:
                # Index with the title counter itself so that successive
                # titles receive successive replacement entries.
                text = self._update_div_data(
                    text, 'title_mb_text', self.title_mb_text_order)
                self.title_mb_text_order += 1
                if self.title_mb_text_order == 2:
                    self.is_pure_div = True
            elif self.is_pure_div:
                text = self._update_div_data(text, 'pure_div_text', self.is_pure_div_order)
                self.is_pure_div_order += 1
                self.is_pure_div = False
            elif 'mb-5' in text:
                text = self._update_div_data(text, 'mb_5_text', self.mb_5_order)
                self.mb_5_order += 1
            elif '<a ' in text:
                soup = BeautifulSoup(text, "html.parser")
                soup.a.string = self.update_data.get('a', [''])[self.a_order]
                text = self._get_spaces(text) + str(soup)
                self.a_order += 1
        except Exception as err:
            logger.error('section parser failed with %s', err)
        return content + text
|
|
|
+
|
|
|
+
|
|
|
# Registry of specialised parser classes, keyed by section class name.
SECTION_PARSER = {
    'section18': Section18SectionParser,
}
|