# parsers.py
import logging

from bs4 import BeautifulSoup
  3. logger = logging.getLogger(__name__)
  4. def get_section_parser(section_class_name):
  5. return SECTION_PARSER.get(section_class_name, SectionParser)
  6. class SectionParser():
  7. def __init__(self, update_data):
  8. self.update_data = update_data
  9. self.b_order, self.p_order, self.img_order, self.a_order = 0, 0, 0, 0
  10. self.card_text_order, self.title_mb_text_order, self.yt_video_order = 0, 0, 0
  11. self.is_amp_img, self.is_amp_youtube = False, False
  12. def _get_spaces(self, text):
  13. count = 0
  14. for i in text:
  15. if i == '<':
  16. break
  17. count += 1
  18. return (count - 1) * ' '
  19. def update(self, content, text):
  20. try:
  21. if '<b' in text:
  22. soup = BeautifulSoup(text, "html.parser")
  23. soup.b.string = self.update_data.get('b', [''])[self.b_order]
  24. text = self._get_spaces(text) + str(soup)
  25. self.b_order += 1
  26. elif '<p' in text:
  27. soup = BeautifulSoup(text, "html.parser")
  28. soup.p.string = self.update_data.get('p', [''])[self.p_order]
  29. text = self._get_spaces(text) + str(soup)
  30. self.p_order += 1
  31. elif '<a ' in text and '</a>' in text:
  32. soup = BeautifulSoup(text, "html.parser")
  33. soup.a.string = self.update_data.get('a', [''])[self.a_order]
  34. text = self._get_spaces(text) + str(soup)
  35. self.a_order += 1
  36. elif 'card-text' in text:
  37. soup = BeautifulSoup(text, "html.parser")
  38. soup.div.string = self.update_data.get('card_text', [''])[self.card_text_order]
  39. text = self._get_spaces(text) + str(soup)
  40. self.card_text_order += 1
  41. elif 'title mb' in text:
  42. soup = BeautifulSoup(text, "html.parser")
  43. soup.div.string = self.update_data.get('title_mb_text', [''])[self.card_text_order]
  44. text = self._get_spaces(text) + str(soup)
  45. self.title_mb_text_order += 1
  46. elif '<amp-img' in text:
  47. self.is_amp_img = True
  48. elif '<amp-youtube' in text:
  49. self.is_amp_youtube = True
  50. if self.is_amp_img:
  51. if 'src=' in text:
  52. text_list = text.split('src=')
  53. text_list[-1] = 'src="{}"\n'.format(self.update_data.get(
  54. 'img', [{}])[self.img_order].get('src', ''))
  55. text = ''.join(text_list)
  56. if '</amp-img>' in text:
  57. self.is_amp_img = True
  58. self.img_order += 1
  59. elif self.is_amp_youtube:
  60. if 'data-videoid' in text:
  61. text_list = text.split('data-videoid=')
  62. text_list[-1] = 'data-videoid="{}"\n'.format(self.update_data.get(
  63. 'yt_video', [{}])[self.yt_video_order].get('videoid', ''))
  64. text = ''.join(text_list)
  65. if '</amp-youtube>' in text:
  66. self.is_amp_youtube = True
  67. self.yt_video_order += 1
  68. except Exception as err:
  69. logger.error('section parser failed with {}'.format(err))
  70. finally:
  71. content += text
  72. return content
  73. class Section18SectionParser(SectionParser):
  74. def __init__(self, update_data):
  75. super().__init__(update_data)
  76. self.is_pure_div = False
  77. self.is_pure_div_order, self.mb_5_order = 0, 0
  78. def _update_div_data(self, text, update_key, order):
  79. soup = BeautifulSoup(text, "html.parser")
  80. soup.div.string = self.update_data.get(update_key, [''])[order]
  81. return self._get_spaces(text) + str(soup)
  82. def update(self, content, text):
  83. try:
  84. if 'title mb' in text:
  85. text = self._update_div_data(text, 'title_mb_text', self.card_text_order)
  86. self.title_mb_text_order += 1
  87. if self.title_mb_text_order == 2:
  88. self.is_pure_div = True
  89. elif self.is_pure_div:
  90. text = self._update_div_data(text, 'pure_div_text', self.is_pure_div_order)
  91. self.is_pure_div_order += 1
  92. self.is_pure_div = False
  93. elif 'mb-5' in text:
  94. text = self._update_div_data(text, 'mb_5_text', self.mb_5_order)
  95. self.mb_5_order += 1
  96. elif '<a ' in text:
  97. soup = BeautifulSoup(text, "html.parser")
  98. soup.a.string = self.update_data.get('a', [''])[self.a_order]
  99. text = self._get_spaces(text) + str(soup)
  100. self.a_order += 1
  101. except Exception as err:
  102. logger.error('section parser failed with {}'.format(err))
  103. finally:
  104. content += text
  105. return content
  106. SECTION_PARSER = {'section18': Section18SectionParser}