srtparser.py 1.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. '''
  2. (C) 2019 Raryel C. Souza
  3. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation, either version 3 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program. If not, see <https://www.gnu.org/licenses/>.
  13. '''
  14. import re, sys
  15. class SRTParser(object):
  16. @staticmethod
  17. def extractTextFromSRT(fileSRT):
  18. file_name = fileSRT
  19. file_encoding = 'utf-8'
  20. #loop through the lines for parsing
  21. with open(file_name, encoding=file_encoding, errors='replace') as f:
  22. lines = f.readlines()
  23. new_lines = SRTParser.clean_up(lines)
  24. new_file_name = file_name[:-4] + '.txt'
  25. #write parsed txt file
  26. with open(new_file_name, 'w', encoding=file_encoding) as f:
  27. for line in new_lines:
  28. f.write(line)
  29. @staticmethod
  30. def clean_up(lines):
  31. regexSubtitleIndexNumber = re.compile("[0-9]+")
  32. new_lines = []
  33. for line in lines[1:]:
  34. #if line empty or
  35. #if line contains --> or
  36. #if line matches the subtitle index regex
  37. #then skip line
  38. if (not line or not line.strip()) or ("-->" in line) or regexSubtitleIndexNumber.match(line):
  39. continue
  40. else:
  41. #append line
  42. new_lines.append(line)
  43. return new_lines