Как исправить ошибку декодирования (UnicodeDecodeError)? Параметр encoding_errors='ignore' не помогает. Заранее узнать кодировку файла тоже невозможно.
Библиотека
https://github.com/btimby/fulltext
Мой код:
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import fulltext as ft
import sys
# Module-level debug flag; nothing in the visible snippet reads it.
debug = True
def gt(f):
    """Extract text from *f* via fulltext, trying several encodings in turn.

    Each candidate encoding is passed to ``ft.get`` together with
    ``encoding_errors='ignore'``. The first call that succeeds wins and its
    result is returned as-is (an empty string is a valid result).

    Args:
        f: Path (or file object) handed straight to ``ft.get``.

    Returns:
        The text returned by ``ft.get`` for the first encoding that worked.

    Raises:
        UnicodeDecodeError: If every candidate encoding failed; the error
            from the last attempt is re-raised.
    """
    encodings = ("utf8", "cp1251")
    last_error = None
    for encoding in encodings:
        try:
            return ft.get(f, encoding=encoding, encoding_errors="ignore")
        except UnicodeDecodeError as err:
            # Do not give up on the first failure: remember it and fall
            # through to the next candidate encoding.
            last_error = err
    # NOTE(review): the reported traceback originates inside the library's
    # text backend (f.read), so the installed fulltext version may not apply
    # `encoding` / `encoding_errors` when opening the file - confirm.
    raise last_error
# Script entry: extract the text of a hard-coded sample file and print it.
print(gt("book.epub"))
Ошибка:
'utf-8' codec can't decode byte 0xfb in position 10: invalid start byte
Traceback (most recent call last):
File "ft.py", line 31, in <module>
print(gt(sys.argv[1]))
File "ft.py", line 20, in gt
raise e[1]
File "ft.py", line 15, in gt
t = ft.get(f, encoding = encoding, encoding_errors = 'ignore')
File "/usr/local/lib/python3.7/site-packages/fulltext/__init__.py", line 154, in get
backend, path_or_file, mime=mime, name=name, **kwargs)
File "/usr/local/lib/python3.7/site-packages/fulltext/__init__.py", line 90, in _get_path
return backend._get_file(f, **kwargs)
File "/usr/local/lib/python3.7/site-packages/fulltext/backends/__text.py", line 20, in _get_file
text = f.read(BUFFER_MAX)
File "/usr/local/lib/python3.7/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfb in position 10: invalid start byte
root@v43195:/var/www/html/stc# clear
root@v43195:/var/www/html/stc# python3 ft.py Davydenko_Uchilka.508202.fb2.epub
'utf-8' codec can't decode byte 0xfb in position 10: invalid start byte
Traceback (most recent call last):
File "ft.py", line 31, in <module>
print(gt(sys.argv[1]))
File "ft.py", line 20, in gt
raise e[1]
File "ft.py", line 15, in gt
t = ft.get(f, encoding = encoding, encoding_errors = 'ignore')
File "/usr/local/lib/python3.7/site-packages/fulltext/__init__.py", line 154, in get
backend, path_or_file, mime=mime, name=name, **kwargs)
File "/usr/local/lib/python3.7/site-packages/fulltext/__init__.py", line 90, in _get_path
return backend._get_file(f, **kwargs)
File "/usr/local/lib/python3.7/site-packages/fulltext/backends/__text.py", line 20, in _get_file
text = f.read(BUFFER_MAX)
File "/usr/local/lib/python3.7/codecs.py", line 322, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfb in position 10: invalid start byte