Browse Source

add parsers and validators into utils

weichen 3 years ago
parent
commit
127a840188
3 changed files with 206 additions and 0 deletions
  1. 9 0
      models/utils/__init__.py
  2. 119 0
      models/utils/parsers.py
  3. 78 0
      models/utils/validators.py

+ 9 - 0
models/utils/__init__.py

@@ -0,0 +1,9 @@
+def write_md(f_dir, content):
+    """Overwrite the file at ``f_dir`` with ``content``.
+
+    The file is opened in text write mode, so it is created when missing
+    and truncated when it already exists.
+    """
+    with open(f_dir, mode='w') as out_file:
+        out_file.write(content)
+
+
+def read_line_md(f_dir):
+    """Return the contents of the file at ``f_dir`` as a list of lines.
+
+    The list comes straight from ``readlines``, so each entry keeps its
+    line ending.
+    """
+    with open(f_dir, mode='r') as in_file:
+        return in_file.readlines()

+ 119 - 0
models/utils/parsers.py

@@ -0,0 +1,119 @@
+from bs4 import BeautifulSoup
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def get_section_parser(section_class_name):
+    """Look up the parser class registered for ``section_class_name``.
+
+    Names that are not keys of ``SECTION_PARSER`` fall back to the generic
+    ``SectionParser`` class.
+    """
+    try:
+        return SECTION_PARSER[section_class_name]
+    except KeyError:
+        return SectionParser
+
+
+class SectionParser():
+    """Line-by-line rewriter that substitutes new content into section markup.
+
+    ``update`` is called with one piece of markup text at a time. Depending
+    on which marker the text contains, the string of a ``<b>``, ``<p>``,
+    ``<a>`` or ``<div>`` element, or the ``src`` / ``data-videoid``
+    attribute inside an ``<amp-img>`` / ``<amp-youtube>`` element, is
+    replaced with the next unused entry from ``update_data``. One counter
+    per kind of replacement tracks which entry comes next.
+    """
+
+    def __init__(self, update_data):
+        """Store the replacement data and reset every counter and flag.
+
+        ``update_data`` is read with ``.get`` using the keys ``'b'``,
+        ``'p'``, ``'a'``, ``'card_text'`` and ``'title_mb_text'`` (indexed
+        sequences whose items become element strings), ``'img'`` (indexed
+        sequence of mappings read for ``'src'``) and ``'yt_video'``
+        (indexed sequence of mappings read for ``'videoid'``).
+        """
+        self.update_data = update_data
+        self.b_order, self.p_order, self.img_order, self.a_order = 0, 0, 0, 0
+        self.card_text_order, self.title_mb_text_order, self.yt_video_order = 0, 0, 0
+        # True while the lines being fed belong to an open amp element.
+        self.is_amp_img, self.is_amp_youtube = False, False
+
+    def _get_spaces(self, text):
+        """Return the indentation string that ``update`` prepends to rewritten markup.
+
+        The result is a run of spaces one shorter than the number of
+        characters that precede the first ``'<'`` in ``text`` (or than
+        ``len(text)`` when there is no ``'<'``). It is empty when that
+        number is 0 or 1.
+        """
+        first_tag = text.find('<')
+        count = len(text) if first_tag == -1 else first_tag
+        return (count - 1) * ' '
+
+    def _replace_tag_text(self, text, tag_name, update_key, order):
+        """Return ``text`` with the string of its first ``tag_name`` element replaced.
+
+        The new string is ``update_data[update_key][order]``; a missing key
+        is treated as ``['']``. Raises (``IndexError``, ``AttributeError``)
+        when the entry or the element does not exist -- ``update`` catches
+        and logs that.
+        """
+        soup = BeautifulSoup(text, "html.parser")
+        getattr(soup, tag_name).string = self.update_data.get(update_key, [''])[order]
+        return self._get_spaces(text) + str(soup)
+
+    @staticmethod
+    def _replace_attr(text, attr, value):
+        """Return ``text`` cut at its last ``attr=`` and ended with ``attr="value"``.
+
+        Everything before the last ``attr=`` is kept verbatim; everything
+        after it is dropped and the line is terminated with a newline.
+        When ``attr=`` does not occur, the whole text is replaced.
+        """
+        head = text.rpartition(attr + '=')[0]
+        return '{}{}="{}"\n'.format(head, attr, value)
+
+    def update(self, content, text):
+        """Rewrite ``text`` if it matches a known marker and append it to ``content``.
+
+        Any exception raised while rewriting is logged and swallowed, and
+        ``text`` is appended in whatever state it had reached (unchanged if
+        the failure happened before it was rewritten). Returns the extended
+        ``content``.
+        """
+        try:
+            if '<b' in text:
+                text = self._replace_tag_text(text, 'b', 'b', self.b_order)
+                self.b_order += 1
+            elif '<p' in text:
+                text = self._replace_tag_text(text, 'p', 'p', self.p_order)
+                self.p_order += 1
+            elif '<a ' in text:
+                text = self._replace_tag_text(text, 'a', 'a', self.a_order)
+                self.a_order += 1
+            elif 'card-text' in text:
+                text = self._replace_tag_text(
+                    text, 'div', 'card_text', self.card_text_order)
+                self.card_text_order += 1
+            elif 'title mb' in text:
+                # Index with this branch's own counter (the card_text
+                # counter was used here before, picking the wrong entry).
+                text = self._replace_tag_text(
+                    text, 'div', 'title_mb_text', self.title_mb_text_order)
+                self.title_mb_text_order += 1
+            elif '<amp-img' in text:
+                self.is_amp_img = True
+            elif '<amp-youtube' in text:
+                self.is_amp_youtube = True
+            if self.is_amp_img:
+                if 'src=' in text:
+                    src = self.update_data.get(
+                        'img', [{}])[self.img_order].get('src', '')
+                    text = self._replace_attr(text, 'src', src)
+                if '</amp-img>' in text:
+                    # Leave amp-img mode at the closing tag so later lines
+                    # (including amp-youtube ones) are not treated as images.
+                    self.is_amp_img = False
+                    self.img_order += 1
+            elif self.is_amp_youtube:
+                if 'data-videoid' in text:
+                    videoid = self.update_data.get(
+                        'yt_video', [{}])[self.yt_video_order].get('videoid', '')
+                    text = self._replace_attr(text, 'data-videoid', videoid)
+                if '</amp-youtube>' in text:
+                    # Same reset as for amp-img above.
+                    self.is_amp_youtube = False
+                    self.yt_video_order += 1
+        except Exception as err:
+            logger.error('section parser failed with {}'.format(err))
+        finally:
+            content += text
+        return content
+
+
+class Section18SectionParser(SectionParser):
+    """Parser variant registered under ``'section18'`` in ``SECTION_PARSER``.
+
+    Only ``<div>`` strings and ``<a>`` strings are rewritten here. The
+    markers handled are ``title mb``, the text that follows the second
+    ``title mb`` match (treated as a plain ``<div>``), ``mb-5`` and ``<a ``.
+    """
+
+    def __init__(self, update_data):
+        """Initialise the base counters plus the section18-specific state."""
+        super().__init__(update_data)
+        # Set after the second 'title mb' match; consumed by the next text.
+        self.is_pure_div = False
+        self.is_pure_div_order, self.mb_5_order = 0, 0
+
+    def _update_div_data(self, text, update_key, order):
+        """Return ``text`` with its first ``<div>`` string replaced.
+
+        The new string is ``update_data[update_key][order]``; a missing key
+        is treated as ``['']``.
+        """
+        soup = BeautifulSoup(text, "html.parser")
+        soup.div.string = self.update_data.get(update_key, [''])[order]
+        return self._get_spaces(text) + str(soup)
+
+    def update(self, content, text):
+        """Rewrite ``text`` if it matches a section18 marker and append it to ``content``.
+
+        Any exception raised while rewriting is logged and swallowed, and
+        ``text`` is appended in whatever state it had reached. Returns the
+        extended ``content``.
+        """
+        try:
+            if 'title mb' in text:
+                # Index with the title counter itself; card_text_order is
+                # never advanced in this class, so using it always picked
+                # entry 0.
+                text = self._update_div_data(
+                    text, 'title_mb_text', self.title_mb_text_order)
+                self.title_mb_text_order += 1
+                if self.title_mb_text_order == 2:
+                    self.is_pure_div = True
+            elif self.is_pure_div:
+                text = self._update_div_data(text, 'pure_div_text', self.is_pure_div_order)
+                self.is_pure_div_order += 1
+                self.is_pure_div = False
+            elif 'mb-5' in text:
+                text = self._update_div_data(text, 'mb_5_text', self.mb_5_order)
+                self.mb_5_order += 1
+            elif '<a ' in text:
+                soup = BeautifulSoup(text, "html.parser")
+                soup.a.string = self.update_data.get('a', [''])[self.a_order]
+                text = self._get_spaces(text) + str(soup)
+                self.a_order += 1
+        except Exception as err:
+            logger.error('section parser failed with {}'.format(err))
+        finally:
+            content += text
+        return content
+
+
+# Parser classes keyed by section class name; ``get_section_parser`` falls
+# back to ``SectionParser`` for names not listed here.
+SECTION_PARSER = {'section18': Section18SectionParser}

+ 78 - 0
models/utils/validators.py

@@ -0,0 +1,78 @@
+def is_valid_section(section_class_name, text):
+    """Run the validator registered for ``section_class_name`` on ``text``.
+
+    Names that are not keys of ``IS_VALID_SECTION`` are handled by
+    ``unvalid_section``, whose result is ``None``.
+    """
+    validator = IS_VALID_SECTION.get(section_class_name, unvalid_section)
+    return validator(text)
+
+
+def is_valid_section3_section(text):
+    """Report whether ``text`` contains the literal ``class="section3"``."""
+    marker = 'class="section3"'
+    return marker in text
+
+
+def is_valid_section13_overly_section(text):
+    """Report whether ``text`` contains the literal ``class="section13 overly"``."""
+    marker = 'class="section13 overly"'
+    return marker in text
+
+
+def is_valid_section14_section(text):
+    """Report whether ``text`` contains the literal ``class="section14"``."""
+    marker = 'class="section14"'
+    return marker in text
+
+
+def is_valid_section13_section(text):
+    """Report whether ``text`` contains the literal ``class="section13"``."""
+    marker = 'class="section13"'
+    return marker in text
+
+
+def is_valid_section16_section(text):
+    """Report whether ``text`` contains the literal ``class="section16"``."""
+    marker = 'class="section16"'
+    return marker in text
+
+
+def is_valid_section17_section(text):
+    """Report whether ``text`` contains the literal ``class="section17"``."""
+    marker = 'class="section17"'
+    return marker in text
+
+
+def is_valid_section18_section(text):
+    """Report whether ``text`` contains the literal ``class="section18"``."""
+    marker = 'class="section18"'
+    return marker in text
+
+
+def is_valid_section19_section(text):
+    """Report whether ``text`` contains the literal ``class="section19"``."""
+    marker = 'class="section19"'
+    return marker in text
+
+
+def is_valid_section22_section(text):
+    """Report whether ``text`` contains the literal ``class="section22"``."""
+    marker = 'class="section22"'
+    return marker in text
+
+
+def is_valid_section25_section(text):
+    """Report whether ``text`` contains the literal ``class="section25"``."""
+    marker = 'class="section25"'
+    return marker in text
+
+
+def is_valid_section26_section(text):
+    """Report whether ``text`` contains the literal ``class="section26"``."""
+    marker = 'class="section26"'
+    return marker in text
+
+
+def is_valid_section27_section(text):
+    """Report whether ``text`` contains the literal ``class="section27"``."""
+    marker = 'class="section27"'
+    return marker in text
+
+
+def is_valid_section28_section(text):
+    """Report whether ``text`` contains the literal ``class="section28"``."""
+    marker = 'class="section28"'
+    return marker in text
+
+
+def is_valid_section29_section(text):
+    """Report whether ``text`` contains the literal ``class="section29"``."""
+    marker = 'class="section29"'
+    return marker in text
+
+
+def unvalid_section(text):
+    """Fallback validator: ignore ``text`` and always return ``None``."""
+    return
+
+
+# Validator lookup used by ``is_valid_section``: section name -> predicate.
+IS_VALID_SECTION = {
+    'section3': is_valid_section3_section,
+    'section13_overly': is_valid_section13_overly_section,
+    'section13': is_valid_section13_section,
+    'section14': is_valid_section14_section,
+    'section16': is_valid_section16_section,
+    'section17': is_valid_section17_section,
+    'section18': is_valid_section18_section,
+    'section19': is_valid_section19_section,
+    'section22': is_valid_section22_section,
+    'section25': is_valid_section25_section,
+    'section26': is_valid_section26_section,
+    'section27': is_valid_section27_section,
+    'section28': is_valid_section28_section,
+    'section29': is_valid_section29_section,
+}