@@ -95,7 +95,8 @@ digraph {
9595
9696** Items** are the elements of a DataSlice, and can be primitives (e.g. integers
9797or strings), or more complex data structures (e.g. lists, dicts and entities).
98- DataSlices without dimension are called DataItem which always has one item.
98+
99+ A zero-dimensional DataSlice is a scalar item. It is called a DataItem.
99100
100101``` py
101102kd.item(1 , schema = kd.FLOAT32 ) # 1.
@@ -109,9 +110,9 @@ kd.slice([kd.list([1, 2, 3]), kd.list([4, 5])]) # DataSlice of lists
109110kd.slice([kd.dict({' a' :1 , ' b' :2 }), kd.dict({' c' :3 })]) # DataSlice of dicts
110111```
111112
112- NOTE: A ** DataSlice of lists ** is different from a DataSlice with list elements.
113- However, they can be converted from each other as we will show in the later
114- section .
113+ The DataSlice ` kd.slice([kd.list([1, 2, 3]), kd.list([4, 5])]) ` is different
114+ from the DataSlice ` kd.slice([[1, 2, 3], [4, 5]]) ` , as the following example
115+ shows. However, each can be converted to the other, as we will see later.
115116
116117``` py
117118l1 = kd.list([1 , 2 , 3 ])
@@ -152,7 +153,7 @@ kd.group_by(ds) # [[4, 4, 4], [3], [2, 2, 2], [1, 1]]
152153kd.group_by(ds).take(0 ) # [4, 3, 2, 1]
153154kd.unique(ds) # the same as above
154155
155- # Group by can be used to swap dimensions
156+ # group_by can be used to swap dimensions, e.g. to transpose a matrix
156157ds = kd.slice([[1 , 2 , 3 ], [4 , 5 , 6 ], [7 , 8 , 9 ]])
157158kd.group_by(ds.flatten(), kd.index(ds, dim = 1 ).flatten()) # [[1, 4, 7], [2, 5, 8], [3, 6, 9]]
158159```
@@ -224,12 +225,11 @@ Both entities and schemas can be dynamically **allocated** or be
224225x = kd.new(a = 1 , b = kd.new(c = 3 ))
225226
226227# Entities with auto-allocated schemas cannot be mixed together in vectorized ops
227- # kd.new(x=1, schema='s1').get_schema() == kd.new(x=1, schema='s2').get_schema()
228228kd.new(x = 1 ).get_schema() != kd.new(x = 1 ).get_schema() # yes
229229
230230# Auto-allocated schemas can be cast to have the same schema
231231x, y = kd.new(a = 1 ), kd.new(b = 2 ) # two entities with different schemas
232- kd.slice([x, y.with_schema(x.get_schema())]) # the same as above
232+ kd.slice([x, y.with_schema(x.get_schema())])
233233
234234# Universally unique entities can be used similarly to named tuples
235235kd.uu(x = 1 , y = kd.uu(z = 3 ))
@@ -240,7 +240,7 @@ kd.new(x=1, y=2).get_itemid() != kd.new(x=1, y=2).get_itemid() # yes
240240# Universally-uniquely allocated entities always have the same ids
241241kd.uu(x = 1 , y = 2 ).get_itemid() == kd.uu(x = 1 , y = 2 ).get_itemid() # yes
242242
243- # Can encode itemid's intro strings
243+ # Can encode itemid's into strings
244244kd.encode_itemid(kd.new(x = 1 , y = 2 )) # always different, as ids are allocated
245245kd.encode_itemid(kd.uu(x = 1 , y = 2 )) == ' 07aXeaqDy6UJNv8EUfA0jz' # always the same
246246```
@@ -293,12 +293,13 @@ a.x # [1, 2, 3]
293293a = kd.new(x = kd.slice([1 , 2 , 3 ]), schema = ' Foo' ) # The same as above, but more compact
294294
295295a = kd.slice([kd.obj(x = 1 ), kd.obj(y = 2 ), kd.obj(z = 3 )])
296- a.maybe(' x' ) # [1, None, None] - only the first one has 'x' attr
296+
297+ a.maybe(' x' ) # [1, None, None] - only the first one has an attribute 'x'
297298```
298299
299300When accessing a ** single element** of a ** DataSlice of lists** or a ** key** of
300- a ** DataSlice of dicts** , a new DataSlice is returned with corresponding values
301- in original lists and dicts.
301+ a ** DataSlice of dicts** , a new DataSlice is returned with the corresponding
302+ values in the original lists and dicts.
302303
303304``` py
304305a = kd.slice([kd.list([1 , 2 , 3 ]), kd.list([4 , 5 ])])
@@ -322,25 +323,24 @@ a[1] # [2, 5] == [list0[1], list1[1]]
322323# That is, a 1-dim DataSlice of lists becomes a 2-dim DataSlice
323324a[:] # [[1, 2, 3],[4, 5]]
324325
325- # "Explosion", but access only the first two items in each list
326+ # "Explosion" of the first two items in each list
326327a[:2 ] # [[1, 2], [4, 5]]
327- a[:].get_ndim() == a.get_ndim() + 1 # explosion adds dimenstions
328+ a[:].get_ndim() == a.get_ndim() + 1 # explosion adds one dimension
328329```
329330
330331The opposite operation is **implosion**, which returns a DataSlice of lists with
331- one less dimensions , where each list contains the values of the innermost
332- dimension of the original DataSlices .
332+ one fewer dimension , where each list contains the values of the innermost
333+ dimension of the original DataSlice .
333334
334335``` py
335- # Implode replaces the last dimensions with lists
336+ # Implode replaces the last dimension with lists
336337a = kd.slice([[1 , 2 , 3 ], [4 , 5 ]])
337338kd.implode(a) # kd.slice([kd.list([1,2,3]), kd.list([4,5])])
338339kd.implode(a)[:] # == a
339340```
340341
341- ** Keys** and ** values** of ** dicts** are also ** exploded lists** . That is,
342- getting all keys or values of a DataSlice of dicts, returns a DataSlice with one
343- more dimension.
342+ Getting all keys or values of a DataSlice of dicts will return a DataSlice with
343+ one more dimension.
344344
345345``` py
346346a = kd.slice([kd.dict({' a' : 1 , ' b' : 2 }), kd.dict({' b' : 3 , ' c' : 4 })])
@@ -350,12 +350,12 @@ a.get_values() # [[1, 2], [3, 4]]
350350# shortcut for get_values()
351351a[:] # [[1, 2], [3, 4]]
352352
353- # note, get_keys() doesn't guarantee to preserve the order, but we can sort
353+ # note: get_keys() is not guaranteed to preserve the order, but we can sort the keys before lookup
354354a[kd.sort(a.get_keys())] # [[1, 2], [3, 4]]
355- a.get_keys().get_ndim() == a.get_ndim() + 1 # keys are exploded lists
355+ a.get_keys().get_ndim() == a.get_ndim() + 1 # the keys DataSlice has one more dimension
356356```
357357
358- Below is an example of putting everything together.
358+ Here is an example that puts everything together.
359359
360360``` py
361361a = kd.from_py([{' x' : 1 }, {' x' : 3 }], dict_as_obj = True )
@@ -368,12 +368,15 @@ kd.zip(kd.agg_sum(a[:].x), kd.agg_sum(b[:]['y'])) # [4, 6]
368368## Objects
369369
370370To make it possible to mix different primitives or entities/lists/dicts with
371- different schemas in the same DataSlices, Koda uses ** objects** , which store
372- their own schema ** similar to python objects** which store their classes as
373- ` __class__ ` attribute.
371+ different schemas in a single DataSlice, Koda uses ** objects** , which store
372+ their schema in their data.
373+
374+ There are two main kinds of objects in Koda:
374375
375- NOTE: primitives are considered as objects though they cannot have attributes
376- because their schemas are embedded in the data.
376+ * Primitives, such as integers and strings.
377+ * Objects that can have attributes and that use a special attribute to store
378+ their schemas. They are ** similar to Python objects** that store their
379+ classes in the ` __class__ ` attribute.
377380
378381``` py
379382kd.obj(x = 2 , y = kd.obj(z = 3 ))
@@ -392,7 +395,7 @@ kd.slice([kd.obj(x=1,y=2), kd.obj(x="hello", y="world"), kd.obj(1)])
392395kd.obj(x = 1 ).get_schema() # kd.OBJECT
393396kd.obj(x = 1 ).get_schema() == kd.obj(1 ).get_schema() # yes
394397
395- # Get per-item schemas stored in each objects
398+ # Get per-item schemas stored in every object
396399kd.obj(x = 1 ).get_obj_schema() # IMPLICIT_SCHEMA(x=INT32)
397400kd.obj(x = 1 ).get_obj_schema() != kd.obj(1 ).get_obj_schema() # yes, different actual schemas
398401kd.slice([kd.obj(x = 1 ,y = 2 ), kd.obj(x = " hello" , y = " world" ), kd.obj(1 )]).get_obj_schema()
@@ -423,10 +426,11 @@ kd.obj(x=1, y=2).with_schema(my_schema)
423426kd.from_py({' x' : 1 , ' y' : 2 }, dict_as_obj = True ).with_schema(my_schema) # the same as above
424427```
425428
426- Note: There is additional ** performance cost** during vectorized operations, as
427- each item can have its own schema in this case, and different objects might have
428- different sets of attributes. For large data, using entities with explicit
429- schemas is recommended.
429+ Note: Compared to entities, objects have a higher ** performance overhead**
430+ during vectorized operations, as each object in a DataSlice has its own schema,
431+ and different objects in the same DataSlice might have different sets of
432+ attributes. For large data, the use of entities with explicit schemas is
433+ recommended for faster execution.
430434
431435Similar to entities, lists and dicts can be objects too.
432436
@@ -446,13 +450,15 @@ assert d_objs.get_schema() == kd.OBJECT
446450d_objs.get_obj_schema() # [DICT{STRING, INT32}, DICT{INT32, BOOLEAN}]
447451```
448452
449- Primitives can be treated as objects too as their schemas can be inferred from
450- the values.
453+ Primitives are also objects. Their schemas are inferred from their values.
451454
452455``` py
453- kd.obj(1 ) # INT32
456+ kd.obj(1 )
454457kd.obj(kd.int64(1 ))
455- kd.obj(' hello' ) # STRING
458+ kd.obj(' hello' )
459+
460+ assert kd.obj(1 ).get_schema() == kd.OBJECT
461+ assert kd.obj(1 ).get_obj_schema() == kd.INT32
456462
457463# Dict values are objects
458464# No need to wrap them using kd.obj
@@ -462,8 +468,8 @@ d.get_schema() # DICT{STRING, OBJECT}
462468
463469## Sparsity and Masks
464470
465- ** Sparsity** is a first-class citizen in Koda. Every item in a DataSlice can be
466- present or missing and all operators support sparse DataSlice .
471+ ** Sparsity** is a first-class concept in Koda. Every item in a DataSlice can be
472+ present or missing and all operators support missing values .
467473
468474``` py
469475a = kd.slice([[1 , None ], [4 ]])
@@ -473,8 +479,8 @@ kd.agg_any(a) # [present, present]
473479kd.agg_all(a) # [missing, present]
474480```
475481
476- ** Masks** are used to represent present/missing state, as well as used ** instead
477- of booleans ** for comparison and logical ops .
482+ ** Masks** are used to represent present/missing state. They are also used in
483+ comparison and logical operations .
478484
479485``` py
480486# Get the sparsity of a DataSlice
@@ -483,6 +489,11 @@ kd.slice([1, None, 3, 4]) != 3 # [present, missing, missing, present]
483489kd.slice([1 , 2 , 3 , 4 ]) > 2 # [missing, missing, present, present]
484490```
485491
492+ Using masks instead of Booleans in comparison and logical operations is useful
493+ because masks have a 2-valued logic. In the presence of missing values, the
494+ Booleans have a 3-valued logic (over True, False, missing), which is more
495+ complex and confusing.
496+
486497Masks can be used to ** filter** or ** select** items in a DataSlice. The
487498difference is that filtering does not change the shape of the DataSlice and
488499filtered out items become missing, while selection changes the shape by only
0 commit comments