Skip to content

Support encoding indefinite containers #256

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 33 additions & 5 deletions cbor2/_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ class CBOREncoder:
"string_referencing",
"string_namespacing",
"_string_references",
"indefinite_containers",
)

_fp: IO[bytes]
Expand All @@ -138,6 +139,7 @@ def __init__(
canonical: bool = False,
date_as_datetime: bool = False,
string_referencing: bool = False,
indefinite_containers: bool = False,
):
"""
:param fp:
Expand Down Expand Up @@ -168,6 +170,8 @@ def __init__(
:param string_referencing:
set to ``True`` to allow more efficient serializing of repeated string
values
:param indefinite_containers:
encode containers as indefinite (use stop code instead of specifying length)

"""
self.fp = fp
Expand All @@ -177,6 +181,7 @@ def __init__(
self.value_sharing = value_sharing
self.string_referencing = string_referencing
self.string_namespacing = string_referencing
self.indefinite_containers = indefinite_containers
self.default = default
self._canonical = canonical
self._shared_containers: dict[
Expand Down Expand Up @@ -395,9 +400,11 @@ def _stringref(self, value: str | bytes) -> bool:

return False

def encode_length(self, major_tag: int, length: int) -> None:
def encode_length(self, major_tag: int, length: int | None) -> None:
major_tag <<= 5
if length < 24:
if length is None: # Indefinite
self._fp_write(struct.pack(">B", major_tag | 31))
elif length < 24:
self._fp_write(struct.pack(">B", major_tag | length))
elif length < 256:
self._fp_write(struct.pack(">BB", major_tag | 24, length))
Expand All @@ -408,6 +415,10 @@ def encode_length(self, major_tag: int, length: int) -> None:
else:
self._fp_write(struct.pack(">BQ", major_tag | 27, length))

def encode_break(self) -> None:
    """Write the "break" stop code used for indefinite containers."""
    # Major type 7 in the top three bits, additional info 31 in the low five.
    stop_code = (7 << 5) | 31
    self._fp_write(struct.pack(">B", stop_code))

def encode_int(self, value: int) -> None:
# Big integers (2 ** 64 and over)
if value >= 18446744073709551616 or value < -18446744073709551616:
Expand Down Expand Up @@ -446,17 +457,23 @@ def encode_string(self, value: str) -> None:

@container_encoder
def encode_array(self, value: Sequence[Any]) -> None:
self.encode_length(4, len(value))
self.encode_length(4, len(value) if not self.indefinite_containers else None)
for item in value:
self.encode(item)

if self.indefinite_containers:
self.encode_break()

@container_encoder
def encode_map(self, value: Mapping[Any, Any]) -> None:
self.encode_length(5, len(value))
self.encode_length(5, len(value) if not self.indefinite_containers else None)
for key, val in value.items():
self.encode(key)
self.encode(val)

if self.indefinite_containers:
self.encode_break()

def encode_sortable_key(self, value: Any) -> tuple[int, bytes]:
"""
Takes a key and calculates the length of its optimal byte
Expand All @@ -471,7 +488,7 @@ def encode_sortable_key(self, value: Any) -> tuple[int, bytes]:
def encode_canonical_map(self, value: Mapping[Any, Any]) -> None:
"""Reorder keys according to Canonical CBOR specification"""
keyed_keys = ((self.encode_sortable_key(key), key, value) for key, value in value.items())
self.encode_length(5, len(value))
self.encode_length(5, len(value) if not self.indefinite_containers else None)
for sortkey, realkey, value in sorted(keyed_keys):
if self.string_referencing:
# String referencing requires that the order encoded is
Expand All @@ -482,6 +499,9 @@ def encode_canonical_map(self, value: Mapping[Any, Any]) -> None:
self._fp_write(sortkey[1])
self.encode(value)

if self.indefinite_containers:
self.encode_break()

def encode_semantic(self, value: CBORTag) -> None:
# Nested string reference domains are distinct
old_string_referencing = self.string_referencing
Expand Down Expand Up @@ -699,6 +719,7 @@ def dumps(
canonical: bool = False,
date_as_datetime: bool = False,
string_referencing: bool = False,
indefinite_containers: bool = False,
) -> bytes:
"""
Serialize an object to a bytestring.
Expand Down Expand Up @@ -730,6 +751,8 @@ def dumps(
the default behavior in previous releases (cbor2 <= 4.1.2).
:param string_referencing:
set to ``True`` to allow more efficient serializing of repeated string values
:param indefinite_containers:
encode containers as indefinite (use stop code instead of specifying length)
:return: the serialized output

"""
Expand All @@ -743,6 +766,7 @@ def dumps(
canonical=canonical,
date_as_datetime=date_as_datetime,
string_referencing=string_referencing,
indefinite_containers=indefinite_containers,
).encode(obj)
return fp.getvalue()

Expand All @@ -757,6 +781,7 @@ def dump(
canonical: bool = False,
date_as_datetime: bool = False,
string_referencing: bool = False,
indefinite_containers: bool = False,
) -> None:
"""
Serialize an object to a file.
Expand Down Expand Up @@ -788,6 +813,8 @@ def dump(
:param date_as_datetime:
set to ``True`` to serialize date objects as datetimes (CBOR tag 0), which was
the default behavior in previous releases (cbor2 <= 4.1.2).
:param indefinite_containers:
encode containers as indefinite (use stop code instead of specifying length)
:param string_referencing:
set to ``True`` to allow more efficient serializing of repeated string values

Expand All @@ -801,4 +828,5 @@ def dump(
canonical=canonical,
date_as_datetime=date_as_datetime,
string_referencing=string_referencing,
indefinite_containers=indefinite_containers,
).encode(obj)
1 change: 1 addition & 0 deletions docs/versionhistory.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ This library adheres to `Semantic Versioning <https://semver.org/>`_.

- Dropped support for Python 3.8
(#247 <https://github.com/agronholm/cbor2/pull/247>_; PR by @hugovk)
- Added support for encoding indefinite containers
(#256 <https://github.com/agronholm/cbor2/pull/256>_; PR by @CZDanol)

**5.6.5** (2024-10-09)

Expand Down
91 changes: 76 additions & 15 deletions source/encoder.c
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ CBOREncoder_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
self->shared_handler = NULL;
self->string_referencing = false;
self->string_namespacing = false;
self->indefinite_containers = false;
}
return (PyObject *) self;
}
Expand All @@ -126,16 +127,16 @@ CBOREncoder_init(CBOREncoderObject *self, PyObject *args, PyObject *kwargs)
{
static char *keywords[] = {
"fp", "datetime_as_timestamp", "timezone", "value_sharing", "default",
"canonical", "date_as_datetime", "string_referencing", NULL
"canonical", "date_as_datetime", "string_referencing", "indefinite_containers", NULL
};
PyObject *tmp, *fp = NULL, *default_handler = NULL, *tz = NULL;
int value_sharing = 0, timestamp_format = 0, enc_style = 0,
date_as_datetime = 0, string_referencing = 0;
date_as_datetime = 0, string_referencing = 0, indefinite_containers = 0;

if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|pOpOppp", keywords,
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|pOpOpppp", keywords,
&fp, &timestamp_format, &tz, &value_sharing,
&default_handler, &enc_style, &date_as_datetime,
&string_referencing))
&string_referencing, &indefinite_containers))
return -1;
// Predicate values are returned as ints, but need to be stored as bool or ubyte
if (timestamp_format == 1)
Expand All @@ -150,6 +151,8 @@ CBOREncoder_init(CBOREncoderObject *self, PyObject *args, PyObject *kwargs)
self->string_referencing = true;
self->string_namespacing = true;
}
if (indefinite_containers == 1)
self->indefinite_containers = true;


if (_CBOREncoder_set_fp(self, fp, NULL) == -1)
Expand Down Expand Up @@ -345,17 +348,19 @@ CBOREncoder_write(CBOREncoderObject *self, PyObject *data)
Py_RETURN_NONE;
}


static int
encode_length(CBOREncoderObject *self, const uint8_t major_tag,
const uint64_t length)
encode_length_possibly_indefinite(CBOREncoderObject *self, const uint8_t major_tag,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain why you had to split this function into two parts?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. I wanted to reduce the diff by keeping the rest of the encode_length calls the same (so the extra parameter does not have to be added everywhere).
  2. I felt that the bool parameter would be somewhat cryptic if there was not some addition to the function name, so I created encode_length_possibly_indefinite. Most of the use cases don't care about indefinite encoding, so it didn't make sense to drag this function there - so we have two now.

Copy link
Owner

@agronholm agronholm Apr 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I took a better look at this, and it doesn't make a whole lot of sense. uint64_t cannot possibly be -1 since it's unsigned. The actual value would then become a very large integer (18446744073709551615) instead, yes? While I doubt anyone will really try to actually encode such enormous structures, it does raise other questions, like if you felt this was an acceptable sentinel value, why then did you have to split encode_length() instead of just using -1 as the sentinel for indefinite length? And why can't encode_length() just directly look at self->indefinite_containers and if it's true, ignore the length parameter?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

encode_length cannot directly use indefinite_containers, because it is also used in other places besides encoding array/map lengths.

const uint64_t length, const bool indefinite)
{
LeadByte *lead;
char buf[sizeof(LeadByte) + sizeof(uint64_t)];

lead = (LeadByte*)buf;
lead->major = major_tag;
if (length < 24) {
if (indefinite) {
lead->subtype = 31;
return fp_write(self, buf, 1);
} else if (length < 24) {
lead->subtype = (uint8_t) length;
return fp_write(self, buf, 1);
} else if (length <= UCHAR_MAX) {
Expand All @@ -377,21 +382,62 @@ encode_length(CBOREncoderObject *self, const uint8_t major_tag,
}
}

// Definite-length wrapper: forwards to encode_length_possibly_indefinite()
// with the indefinite flag fixed to false, and returns its result unchanged.
static int
encode_length(CBOREncoderObject *self, const uint8_t major_tag,
              const uint64_t length)
{
    const bool indefinite = false;

    return encode_length_possibly_indefinite(self, major_tag, length,
                                             indefinite);
}

// Converter for the PyArg_ParseTuple "O&" format unit: accepts an int or None
// and stores the result in the uint64_t pointed to by *param*.
//
// Returns 1 on success. Returns 0 with a Python exception set when *obj* is
// neither int nor None, or when the int cannot be converted to an unsigned
// 64-bit value.
//
// None is reported as the all-ones sentinel (uint64_t) -1, which is the value
// the caller compares against to detect "indefinite length".
// NOTE(review): an explicit int equal to 2**64 - 1 is therefore
// indistinguishable from None; a dedicated out-structure would remove that
// ambiguity.
int uint64_or_none(PyObject *obj, void *param) {
    uint64_t *out = (uint64_t *) param;
    unsigned long long val;

    if (obj == Py_None) {
        // Store the sentinel explicitly instead of relying on the caller
        // having pre-initialised the destination.
        *out = (uint64_t) -1;
        return 1;
    }
    if (!PyLong_Check(obj)) {
        PyErr_SetString(PyExc_TypeError, "must be int or None");
        return 0;
    }

    // unsigned long is only 32 bits wide on LLP64 platforms (e.g. 64-bit
    // Windows), so PyLong_AsUnsignedLong would reject lengths >= 2**32 there.
    // The unsigned long long variant covers the full uint64_t range.
    val = PyLong_AsUnsignedLongLong(obj);
    if (val == (unsigned long long) -1 && PyErr_Occurred())
        return 0;

    *out = (uint64_t) val;
    return 1;
}

// CBOREncoder.encode_length(self, major_tag, length)
static PyObject *
CBOREncoder_encode_length(CBOREncoderObject *self, PyObject *args)
{
uint8_t major_tag;
uint64_t length;
uint64_t length = -1;
Copy link
Author

@CZDanol CZDanol Apr 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, -1 becomes a very large integer.

I did this because I was too lazy to set up a custom structure for the uint64_or_none that would be required for PyArg_ParseTuple. It would be cleaner to have it distinct, yes. It is somewhat dirty, so I didn't want to bring this approach to other places.

Should I remake it?


if (!PyArg_ParseTuple(args, "BK", &major_tag, &length))
if (!PyArg_ParseTuple(args, "BO&", &major_tag, &uint64_or_none, &length))
return NULL;
if (encode_length(self, major_tag, length) == -1)
if (encode_length_possibly_indefinite(self, major_tag, length, length == -1) == -1)
return NULL;
Py_RETURN_NONE;
}

// Write the single-byte "break" stop code (major type 7, subtype 31) to the
// output. Returns whatever fp_write() returns.
static int
encode_break(CBOREncoderObject *self)
{
    LeadByte stop_code;

    stop_code.subtype = 31;
    stop_code.major = 7;
    return fp_write(self, (const char *) &stop_code, 1);
}

// CBOREncoder.encode_break(self)
//
// Python-visible wrapper around encode_break(): writes the break stop code
// and returns None, or returns NULL when the underlying write reports -1.
//
// The function is registered as METH_NOARGS, so it must have the
// two-argument PyCFunction signature; the second argument is unused. Calling
// a one-parameter function through a PyCFunction pointer is undefined
// behaviour in C.
static PyObject *
CBOREncoder_encode_break(CBOREncoderObject *self, PyObject *Py_UNUSED(ignored))
{
    if (encode_break(self) == -1)
        return NULL;
    Py_RETURN_NONE;
}


// Given a deferred type tuple (module-name, type-name), find the specified
// module in sys.modules, get the specified type from within it and return it
Expand Down Expand Up @@ -761,7 +807,7 @@ encode_array(CBOREncoderObject *self, PyObject *value)
if (fast) {
length = PySequence_Fast_GET_SIZE(fast);
items = PySequence_Fast_ITEMS(fast);
if (encode_length(self, 4, length) == 0) {
if (encode_length_possibly_indefinite(self, 4, length, self->indefinite_containers) == 0) {
while (length) {
ret = CBOREncoder_encode(self, *items);
if (ret)
Expand All @@ -774,6 +820,9 @@ encode_array(CBOREncoderObject *self, PyObject *value)
Py_INCREF(Py_None);
ret = Py_None;
}
if (self->indefinite_containers && encode_break(self) == -1) {
goto error;
}
error:
Py_DECREF(fast);
}
Expand All @@ -796,7 +845,7 @@ encode_dict(CBOREncoderObject *self, PyObject *value)
PyObject *key, *val, *ret;
Py_ssize_t pos = 0;

if (encode_length(self, 5, PyDict_Size(value)) == 0) {
if (encode_length_possibly_indefinite(self, 5, PyDict_Size(value), self->indefinite_containers) == 0) {
while (PyDict_Next(value, &pos, &key, &val)) {
Py_INCREF(key);
ret = CBOREncoder_encode(self, key);
Expand All @@ -813,7 +862,11 @@ encode_dict(CBOREncoderObject *self, PyObject *value)
else
return NULL;
}
if (self->indefinite_containers && encode_break(self) == -1) {
return NULL;
}
}

Py_RETURN_NONE;
}

Expand All @@ -830,7 +883,7 @@ encode_mapping(CBOREncoderObject *self, PyObject *value)
if (fast) {
length = PySequence_Fast_GET_SIZE(fast);
items = PySequence_Fast_ITEMS(fast);
if (encode_length(self, 5, length) == 0) {
if (encode_length_possibly_indefinite(self, 5, length, self->indefinite_containers) == 0) {
while (length) {
ret = CBOREncoder_encode(self, PyTuple_GET_ITEM(*items, 0));
if (ret)
Expand All @@ -845,6 +898,9 @@ encode_mapping(CBOREncoderObject *self, PyObject *value)
items++;
length--;
}
if (self->indefinite_containers && encode_break(self) == -1) {
goto error;
}
ret = Py_None;
Py_INCREF(ret);
}
Expand Down Expand Up @@ -1728,7 +1784,7 @@ encode_canonical_map_list(CBOREncoderObject *self, PyObject *list)

if (PyList_Sort(list) == -1)
return NULL;
if (encode_length(self, 5, PyList_GET_SIZE(list)) == -1)
if (encode_length_possibly_indefinite(self, 5, PyList_GET_SIZE(list), self->indefinite_containers) == -1)
return NULL;
for (index = 0; index < PyList_GET_SIZE(list); ++index) {
// If we are encoding string references, the order of the keys
Expand All @@ -1753,6 +1809,9 @@ encode_canonical_map_list(CBOREncoderObject *self, PyObject *list)
else
return NULL;
}
if (self->indefinite_containers && encode_break(self) == -1) {
return NULL;
}
Py_RETURN_NONE;
}

Expand Down Expand Up @@ -2114,6 +2173,8 @@ static PyMethodDef CBOREncoder_methods[] = {
{"encode_length", (PyCFunction) CBOREncoder_encode_length, METH_VARARGS,
"encode the specified *major_tag* with the specified *length* to "
"the output"},
{"encode_break", (PyCFunction) CBOREncoder_encode_break, METH_NOARGS,
"encode break stop code for indefinite containers"},
{"encode_int", (PyCFunction) CBOREncoder_encode_int, METH_O,
"encode the specified integer *value* to the output"},
{"encode_float", (PyCFunction) CBOREncoder_encode_float, METH_O,
Expand Down
1 change: 1 addition & 0 deletions source/encoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ typedef struct {
bool value_sharing;
bool string_referencing;
bool string_namespacing;
bool indefinite_containers;
} CBOREncoderObject;

extern PyTypeObject CBOREncoderType;
Expand Down
Loading
Loading