from bs4 import BeautifulSoup import logging logger = logging.getLogger(__name__) def get_section_parser(section_class_name): return SECTION_PARSER.get(section_class_name, SectionParser) class SectionParser(): def __init__(self, update_data): self.update_data = update_data self.b_order, self.p_order, self.img_order, self.a_order = 0, 0, 0, 0 self.card_text_order, self.title_mb_text_order, self.yt_video_order = 0, 0, 0 self.is_amp_img, self.is_amp_youtube = False, False def _get_spaces(self, text): count = 0 for i in text: if i == '<': break count += 1 return (count - 1) * ' ' def update(self, content, text): try: if '' in text: soup = BeautifulSoup(text, "html.parser") soup.a.string = self.update_data.get('a', [''])[self.a_order] text = self._get_spaces(text) + str(soup) self.a_order += 1 elif 'card-text' in text: soup = BeautifulSoup(text, "html.parser") soup.div.string = self.update_data.get('card_text', [''])[self.card_text_order] text = self._get_spaces(text) + str(soup) self.card_text_order += 1 elif 'title mb' in text: soup = BeautifulSoup(text, "html.parser") soup.div.string = self.update_data.get('title_mb_text', [''])[self.card_text_order] text = self._get_spaces(text) + str(soup) self.title_mb_text_order += 1 elif '' in text: self.is_amp_img = True self.img_order += 1 elif self.is_amp_youtube: if 'data-videoid' in text: text_list = text.split('data-videoid=') text_list[-1] = 'data-videoid="{}"\n'.format(self.update_data.get( 'yt_video', [{}])[self.yt_video_order].get('videoid', '')) text = ''.join(text_list) if '' in text: self.is_amp_youtube = True self.yt_video_order += 1 except Exception as err: logger.error('section parser failed with {}'.format(err)) finally: content += text return content class Section18SectionParser(SectionParser): def __init__(self, update_data): super().__init__(update_data) self.is_pure_div = False self.is_pure_div_order, self.mb_5_order = 0, 0 def _update_div_data(self, text, update_key, order): soup = BeautifulSoup(text, "html.parser") soup.div.string = self.update_data.get(update_key, [''])[order] return self._get_spaces(text) + str(soup) def update(self, content, text): try: if 'title mb' in text: text = self._update_div_data(text, 'title_mb_text', self.card_text_order) self.title_mb_text_order += 1 if self.title_mb_text_order == 2: self.is_pure_div = True elif self.is_pure_div: text = self._update_div_data(text, 'pure_div_text', self.is_pure_div_order) self.is_pure_div_order += 1 self.is_pure_div = False elif 'mb-5' in text: text = self._update_div_data(text, 'mb_5_text', self.mb_5_order) self.mb_5_order += 1 elif '