Generate tfrecord.py : UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfd in position 68: invalid start byte

from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

import os
import io
import pandas as pd
import tensorflow.compat.v1 as tf
from PIL import Image
from object_detection.utils import dataset_util
from collections import namedtuple, OrderedDict


# Command-line options of the script. Every flag is a string defaulting to '':
# the annotations CSV to read, the TFRecord file to write, and the directory
# that holds the images named in the CSV.
flags = tf.app.flags
flags.DEFINE_string(name='csv_input', default='', help='Path to the CSV input')
flags.DEFINE_string(name='output_path', default='', help='Path to output TFRecord')
flags.DEFINE_string(name='image_dir', default='', help='Path to images')
FLAGS = flags.FLAGS


# TODO: replace this hard-coded comparison with a lookup in the label map.
def class_text_to_int(row_label):
    """Map a class name from the annotations CSV to its integer label id.

    Args:
        row_label: Class name taken from one annotation row.

    Returns:
        1 when ``row_label`` equals 'raccoon'; None for any other value.
    """
    if row_label == 'raccoon':
        return 1
    # The original branch was a bare `None` expression statement; return it
    # explicitly so the "unknown label" result is visibly intentional.
    return None


def split(df, group):
    """Partition a DataFrame into one record per distinct value of ``group``.

    Args:
        df: DataFrame holding the annotation rows.
        group: Column name (or other ``DataFrame.groupby`` key) to group by.

    Returns:
        A list of ``data(filename, object)`` namedtuples, one per group key,
        where ``filename`` is the key and ``object`` is the sub-DataFrame of
        rows belonging to that key.
    """
    record = namedtuple('data', ['filename', 'object'])
    by_key = df.groupby(group)

    bundles = []
    for key in by_key.groups:
        bundles.append(record(key, by_key.get_group(key)))
    return bundles


def create_tf_example(group, path):
    """Build the tf.train.Example for one image and its annotation rows.

    Args:
        group: Record with a ``filename`` attribute (image file name) and an
            ``object`` attribute (DataFrame whose rows provide 'xmin', 'xmax',
            'ymin', 'ymax' and 'class').
        path: Directory that contains the image file.

    Returns:
        A tf.train.Example holding the raw image bytes, the image size, and
        the per-box coordinates, class names and class ids.
    """
    image_path = os.path.join(path, '{}'.format(group.filename))
    with tf.gfile.GFile(image_path, 'rb') as image_file:
        encoded_jpg = image_file.read()

    # The image is opened only to learn its pixel dimensions; the bytes stored
    # in the example are the ones read from disk.
    width, height = Image.open(io.BytesIO(encoded_jpg)).size

    filename = group.filename.encode('utf8')
    # The format is recorded as 'jpg' for every image, whatever the file is.
    image_format = b'jpg'

    rows = [row for _, row in group.object.iterrows()]

    # Box coordinates are stored as fractions of the image width / height.
    xmins = [row['xmin'] / width for row in rows]
    xmaxs = [row['xmax'] / width for row in rows]
    ymins = [row['ymin'] / height for row in rows]
    ymaxs = [row['ymax'] / height for row in rows]
    classes_text = [row['class'].encode('utf8') for row in rows]
    classes = [class_text_to_int(row['class']) for row in rows]

    feature = {
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(filename),
        'image/source_id': dataset_util.bytes_feature(filename),
        'image/encoded': dataset_util.bytes_feature(encoded_jpg),
        'image/format': dataset_util.bytes_feature(image_format),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label': dataset_util.int64_list_feature(classes),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))


def main(_):
    """Convert the annotations CSV and its images into one TFRecord file.

    Reads FLAGS.csv_input, groups the rows by their 'filename' column, builds
    one tf.train.Example per image found under FLAGS.image_dir and writes the
    serialized examples to FLAGS.output_path.

    Args:
        _: Unused; the argument that tf.app.run() passes to main.
    """
    # Load and group the annotations first so that a bad --csv_input fails
    # before any output file has been created.
    examples = pd.read_csv(FLAGS.csv_input)
    grouped = split(examples, 'filename')

    # The writer cannot open a file inside a directory that does not exist,
    # so make sure the destination directory is there.
    output_dir = os.path.dirname(FLAGS.output_path)
    if output_dir:
        tf.gfile.MakeDirs(output_dir)

    # The context manager closes the writer even if building or writing an
    # example raises part-way through.
    with tf.python_io.TFRecordWriter(FLAGS.output_path) as writer:
        for group in grouped:
            tf_example = create_tf_example(group, FLAGS.image_dir)
            writer.write(tf_example.SerializeToString())

    output_path = os.path.join(os.getcwd(), FLAGS.output_path)
    print('Successfully created the TFRecords: {}'.format(output_path))


# Script entry point: tf.app.run parses the command-line flags, then calls main.
if __name__ == '__main__':
    tf.app.run(main=main)

Error when running generate_tfrecord.py: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfd in position 68: invalid start byte

Please copy and paste the entire traceback, not just the very last line.

The Python interpreter goes to a lot of trouble to generate the traceback, which is very useful for debugging.

Traceback (most recent call last):
  File "C:\Users\asus\Downloads\custom object\generate_tfrecord.py", line 91, in <module>
    tf.app.run()
  File "C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\platform\app.py", line 36, in run
    _run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
  File "C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\absl\app.py", line 308, in run
    _run_main(main, args)
  File "C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\absl\app.py", line 254, in _run_main
    sys.exit(main(argv))
  File "C:\Users\asus\Downloads\custom object\generate_tfrecord.py", line 77, in main
    writer = tf.python_io.TFRecordWriter(FLAGS.output_path)
  File "C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\lib\io\tf_record.py", line 294, in __init__
    super(TFRecordWriter, self).__init__(
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfd in position 68: invalid start byte

It seems to be unhappy with the output path. It’s trying to convert a file path that’s a bytestring (class bytes) to str and is assuming that the bytestring is UTF-8, but it’s not.

The call in TFRecordWriter.__init__() on line 294 is as follows:

    super(TFRecordWriter, self).__init__(
        compat.as_bytes(path), options._as_record_writer_options())

The compat.as_bytes() function converts the path to bytes. There shouldn’t be a decode error at this step. It probably occurs in the parent class _pywrap_record_io.RecordWriter.

I presume that output_path is parsed from a command-line option. Assuming you’re using Python 2.x, the items in sys.argv are byte strings encoded with the ANSI code page of the current process. In that case, I suggest that you switch to using Python 3, if possible, or else try changing the line in main() as follows:

    writer = tf.python_io.TFRecordWriter(FLAGS.output_path.decode('mbcs'))

On Windows, the “mbcs” encoding is based on the ANSI code page of the current process.

    writer = tf.python_io.TFRecordWriter(FLAGS.output_path.decode('mbcs'))

When I write it that way, I get this error:
Traceback (most recent call last):
File “C:\Users\asus\Downloads\custom object\generate_tfrecord.py”, line 91, in
tf.app.run()
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\platform\app.py”, line 36, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\absl\app.py”, line 308, in run
_run_main(main, args)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\absl\app.py”, line 254, in _run_main
sys.exit(main(argv))
File “C:\Users\asus\Downloads\custom object\generate_tfrecord.py”, line 77, in main
writer = tf.python_io.TFRecordWriter(FLAGS.output_path.decode(‘mbcs’))
AttributeError: ‘str’ object has no attribute ‘decode’. Did you mean: ‘encode’?
when i write:

    writer = tf.python_io.TFRecordWriter(FLAGS.output_path.encode('mbcs'))

error:
Traceback (most recent call last):
File “C:\Users\asus\Downloads\custom object\generate_tfrecord.py”, line 91, in
tf.app.run()
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\platform\app.py”, line 36, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\absl\app.py”, line 308, in run
_run_main(main, args)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\absl\app.py”, line 254, in _run_main
sys.exit(main(argv))
File “C:\Users\asus\Downloads\custom object\generate_tfrecord.py”, line 77, in main
writer = tf.python_io.TFRecordWriter(FLAGS.output_path.encode(‘mbcs’))
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\lib\io\tf_record.py”, line 294, in init
super(TFRecordWriter, self).init(
UnicodeDecodeError: ‘utf-8’ codec can’t decode byte 0xfd in position 68: invalid start byte
when i write:

    writer = tf.python_io.TFRecordWriter(FLAGS.output_path('mbcs'))

Traceback (most recent call last):
File “C:\Users\asus\Downloads\custom object\generate_tfrecord.py”, line 91, in
tf.app.run()
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\platform\app.py”, line 36, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\absl\app.py”, line 308, in run
_run_main(main, args)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\absl\app.py”, line 254, in _run_main
sys.exit(main(argv))
File “C:\Users\asus\Downloads\custom object\generate_tfrecord.py”, line 77, in main
writer = tf.python_io.TFRecordWriter(FLAGS.output_path(‘mbcs’))
TypeError: ‘str’ object is not callable
when i write:

    writer = tf.python_io.TFRecordWriter('mbcs')

Traceback (most recent call last):
File “C:\Users\asus\Downloads\custom object\generate_tfrecord.py”, line 91, in
tf.app.run()
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\tensorflow\python\platform\app.py”, line 36, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\absl\app.py”, line 308, in run
_run_main(main, args)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\absl\app.py”, line 254, in _run_main
sys.exit(main(argv))
File “C:\Users\asus\Downloads\custom object\generate_tfrecord.py”, line 79, in main
examples = pd.read_csv(FLAGS.csv_input)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util_decorators.py”, line 211, in wrapper
return func(*args, **kwargs)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util_decorators.py”, line 331, in wrapper
return func(*args, **kwargs)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py”, line 950, in read_csv
return _read(filepath_or_buffer, kwds)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py”, line 605, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py”, line 1442, in init
self._engine = self._make_engine(f, self.engine)
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py”, line 1735, in _make_engine
self.handles = get_handle(
File “C:\Users\asus\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\common.py”, line 856, in get_handle
handle = open(
FileNotFoundError: [Errno 2] No such file or directory: ‘’
“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”“”
That is the error I get.

I missed that you’re using Python 3.10 when I scanned over the traceback. The new error implies that FLAGS.output_path is already a 3.x str string. In that case, I’m at a loss to explain the original error. The compat.as_bytes() call should encode the path as UTF-8, for eventual use by an internal call that decodes it back to UTF-16 in order to call CreateFileW(). Because a C++ extension module is involved, I’d need to trace the call using both a Python debugger and a native debugger in order to understand how it’s getting to this state.

I’m afraid some libraries stop working when I upgrade Python.