123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119 |
- from bs4 import BeautifulSoup
- import logging
- logger = logging.getLogger(__name__)
def get_section_parser(section_class_name):
    """Return the parser class registered for *section_class_name*.

    Looks the name up in the module-level ``SECTION_PARSER`` registry and
    falls back to the generic ``SectionParser`` when the name is not present.
    """
    try:
        return SECTION_PARSER[section_class_name]
    except KeyError:
        # Unregistered section: use the generic parser.
        return SectionParser
class SectionParser:
    """Rewrite a section's markup one line at a time using ``update_data``.

    ``update_data`` is a dict of replacement values. The keys read by this
    class are ``'b'``, ``'p'``, ``'a'``, ``'card_text'`` and
    ``'title_mb_text'`` (each indexed as a sequence of strings), plus
    ``'img'`` (sequence of dicts read via ``'src'``) and ``'yt_video'``
    (sequence of dicts read via ``'videoid'``).

    Every kind of replacement has its own ``*_order`` counter, so the n-th
    matching line receives the n-th value of the corresponding sequence.
    """

    def __init__(self, update_data):
        """Store *update_data* and reset every per-kind counter and flag."""
        self.update_data = update_data
        self.b_order, self.p_order, self.img_order, self.a_order = 0, 0, 0, 0
        self.card_text_order, self.title_mb_text_order, self.yt_video_order = 0, 0, 0
        # True while the lines being fed are between an opening
        # <amp-img / <amp-youtube and the matching closing tag.
        self.is_amp_img, self.is_amp_youtube = False, False

    def _get_spaces(self, text):
        """Return the space prefix that is put in front of a re-rendered line.

        The result is one space fewer than the number of characters that
        precede the first ``'<'`` in *text* (or than ``len(text)`` when there
        is no ``'<'``); it is the empty string when that number is 0 or 1.
        """
        # NOTE(review): the reason for the "minus one" is not visible in this
        # file; it is kept exactly as the original computed it.
        prefix_len = text.find('<')
        if prefix_len < 0:
            prefix_len = len(text)
        return (prefix_len - 1) * ' '

    def _replace_tag_text(self, text, tag_name, update_key, order):
        """Return *text* with the string of its first *tag_name* tag replaced.

        The new string is ``update_data[update_key][order]`` (``['']`` is the
        fallback sequence when the key is missing). Raises ``IndexError``
        when *order* is past the end of the sequence and ``AttributeError``
        when *text* has no such tag; ``update`` catches and logs both.
        """
        soup = BeautifulSoup(text, "html.parser")
        soup.find(tag_name).string = self.update_data.get(update_key, [''])[order]
        return self._get_spaces(text) + str(soup)

    @staticmethod
    def _replace_attribute(text, attribute, value):
        """Return *text* with everything from the last ``attribute=`` replaced.

        The tail after the last ``attribute=`` is swapped for
        ``attribute="value"`` followed by a newline. Earlier pieces are
        joined back without the separator, as the original code did.
        """
        pieces = text.split(attribute + '=')
        pieces[-1] = '{}="{}"\n'.format(attribute, value)
        return ''.join(pieces)

    def update(self, content, text):
        """Process one line *text* and return *content* with it appended.

        A recognised line is rewritten with the next value from
        ``update_data``. Any exception raised while rewriting is logged and
        the line is appended as it stood at that point, so a bad or missing
        value never aborts the whole document.
        """
        try:
            if '<b' in text:
                text = self._replace_tag_text(text, 'b', 'b', self.b_order)
                self.b_order += 1
            elif '<p' in text:
                text = self._replace_tag_text(text, 'p', 'p', self.p_order)
                self.p_order += 1
            elif '<a ' in text and '</a>' in text:
                text = self._replace_tag_text(text, 'a', 'a', self.a_order)
                self.a_order += 1
            elif 'card-text' in text:
                text = self._replace_tag_text(
                    text, 'div', 'card_text', self.card_text_order)
                self.card_text_order += 1
            elif 'title mb' in text:
                # Fixed: index with this branch's own counter; the original
                # used card_text_order here while incrementing
                # title_mb_text_order.
                text = self._replace_tag_text(
                    text, 'div', 'title_mb_text', self.title_mb_text_order)
                self.title_mb_text_order += 1
            elif '<amp-img' in text:
                self.is_amp_img = True
            elif '<amp-youtube' in text:
                self.is_amp_youtube = True

            if self.is_amp_img:
                if 'src=' in text:
                    image = self.update_data.get('img', [{}])[self.img_order]
                    text = self._replace_attribute(
                        text, 'src', image.get('src', ''))
                if '</amp-img>' in text:
                    # Fixed: leave the amp-img state on the closing tag. The
                    # original set the flag to True again, so later `src=`
                    # lines were rewritten and the amp-youtube branch below
                    # became unreachable.
                    self.is_amp_img = False
                    self.img_order += 1
            elif self.is_amp_youtube:
                if 'data-videoid' in text:
                    video = self.update_data.get(
                        'yt_video', [{}])[self.yt_video_order]
                    text = self._replace_attribute(
                        text, 'data-videoid', video.get('videoid', ''))
                if '</amp-youtube>' in text:
                    # Fixed: same reset as for amp-img.
                    self.is_amp_youtube = False
                    self.yt_video_order += 1
        except Exception as err:
            # Deliberate best-effort: log and fall through so the line is
            # still emitted.
            logger.error('section parser failed with %s', err)
        return content + text
class Section18SectionParser(SectionParser):
    """Parser variant registered for the ``'section18'`` section.

    In addition to the ``'title_mb_text'`` and ``'a'`` values, it reads
    ``'pure_div_text'`` and ``'mb_5_text'`` from ``update_data``. The line
    that follows the second "title mb" line is treated as a plain ``<div>``
    and filled from ``'pure_div_text'``.
    """

    def __init__(self, update_data):
        """Initialise the base counters plus the ones specific to this section."""
        super().__init__(update_data)
        # Set once the second "title mb" line has been seen; cleared after
        # the next line has been rewritten successfully.
        self.is_pure_div = False
        self.is_pure_div_order, self.mb_5_order = 0, 0

    def _update_div_data(self, text, update_key, order):
        """Return *text* with its first ``<div>`` string replaced.

        The new string is ``update_data[update_key][order]`` (``['']`` is the
        fallback sequence when the key is missing). Errors from a missing
        ``<div>`` or an out-of-range *order* propagate to the caller.
        """
        soup = BeautifulSoup(text, "html.parser")
        soup.div.string = self.update_data.get(update_key, [''])[order]
        return self._get_spaces(text) + str(soup)

    def update(self, content, text):
        """Process one line *text* and return *content* with it appended.

        Any exception raised while rewriting is logged and the line is
        appended unchanged.
        """
        try:
            if 'title mb' in text:
                # Fixed: index with title_mb_text_order. The original used
                # card_text_order, which this class never advances, so every
                # "title mb" line received the first value.
                text = self._update_div_data(
                    text, 'title_mb_text', self.title_mb_text_order)
                self.title_mb_text_order += 1
                if self.title_mb_text_order == 2:
                    self.is_pure_div = True
            elif self.is_pure_div:
                text = self._update_div_data(
                    text, 'pure_div_text', self.is_pure_div_order)
                self.is_pure_div_order += 1
                self.is_pure_div = False
            elif 'mb-5' in text:
                text = self._update_div_data(text, 'mb_5_text', self.mb_5_order)
                self.mb_5_order += 1
            elif '<a ' in text:
                soup = BeautifulSoup(text, "html.parser")
                soup.a.string = self.update_data.get('a', [''])[self.a_order]
                text = self._get_spaces(text) + str(soup)
                self.a_order += 1
        except Exception as err:
            # Deliberate best-effort: log and still emit the line.
            logger.error('section parser failed with %s', err)
        return content + text
# Registry of section class name -> parser class used for that section.
SECTION_PARSER = {
    'section18': Section18SectionParser,
}
|