# Notebooks

Data models - Jupyter Notebooks and components.
## Cell (DatabooksBase)

*pydantic-model*

Jupyter notebook cells. Fields `outputs` and `execution_count` are not included since they should only be present in code cells, and are thus treated as extra fields.
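A minimal construction sketch with hypothetical values; on a code cell, `outputs` and `execution_count` are accepted as extra fields (and required/restricted by the validators below):

```python
from databooks.data_models.notebook import Cell, CellMetadata

# Hypothetical code cell; `outputs` and `execution_count` are extra fields
# that only code cells may (and must) carry.
cell = Cell(
    cell_type="code",
    metadata=CellMetadata(),
    source=["print('hello')\n"],
    outputs=[],
    execution_count=None,
)
```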
### __hash__(self)

*special*

Cells must be hashable for `difflib.SequenceMatcher`.
Source code in `databooks/data_models/notebook.py`

```python
def __hash__(self) -> int:
    """Cells must be hashable for `difflib.SequenceMatcher`."""
    return hash(
        (type(self),)
        + tuple(tuple(v) if isinstance(v, list) else v for v in self.__dict__.values())
    )
```

### cell_has_valid_type(v)

*classmethod*

Check if cell has one of the three predefined types.
Source code in `databooks/data_models/notebook.py`

```python
@validator("cell_type")
def cell_has_valid_type(cls, v: str) -> str:
    """Check if cell has one of the three predefined types."""
    valid_cell_types = ("raw", "markdown", "code")
    if v not in valid_cell_types:
        raise ValueError(f"Invalid cell type. Must be one of {valid_cell_types}")
    return v
```
### clear_metadata(self, *, cell_metadata_keep=None, cell_metadata_remove=None, cell_execution_count=True, cell_outputs=False, remove_fields=['id'])

Clear cell metadata, execution count and outputs.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `cell_metadata_keep` | `Sequence[str]` | Metadata values to keep - simply pass an empty sequence (i.e.: `()`) to remove all extra fields | `None` |
| `cell_metadata_remove` | `Sequence[str]` | Metadata values to remove | `None` |
| `cell_execution_count` | `bool` | Whether or not to keep the execution count | `True` |
| `cell_outputs` | `bool` | Whether or not to keep the cell outputs | `False` |

Returns:

| Type | Description |
|---|---|
| `None` | |
Source code in `databooks/data_models/notebook.py`

```python
def clear_metadata(
    self,
    *,
    cell_metadata_keep: Sequence[str] = None,
    cell_metadata_remove: Sequence[str] = None,
    cell_execution_count: bool = True,
    cell_outputs: bool = False,
    remove_fields: List[str] = ["id"],
) -> None:
    """
    Clear cell metadata, execution count and outputs.

    :param cell_metadata_keep: Metadata values to keep - simply pass an empty
        sequence (i.e.: `()`) to remove all extra fields.
    :param cell_metadata_remove: Metadata values to remove
    :param cell_execution_count: Whether or not to keep the execution count
    :param cell_outputs: whether or not to keep the cell outputs
    :return:
    """
    nargs = sum((cell_metadata_keep is not None, cell_metadata_remove is not None))
    if nargs != 1:
        raise ValueError(
            "Exactly one of `cell_metadata_keep` or `cell_metadata_remove` must"
            f" be passed, got {nargs} arguments."
        )
    if cell_metadata_keep is not None:
        cell_metadata_remove = tuple(
            field for field, _ in self.metadata if field not in cell_metadata_keep
        )
    self.metadata.remove_fields(cell_metadata_remove)  # type: ignore

    self.remove_fields(remove_fields, missing_ok=True)
    if self.cell_type == "code":
        if cell_outputs:
            self.outputs: List[Dict[str, Any]] = []
        if cell_execution_count:
            self.execution_count = None
```
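A usage sketch with the hypothetical `cell` from above. Judging by the source, truthy `cell_execution_count` and `cell_outputs` clear those fields on code cells, and an empty `cell_metadata_keep` removes all metadata fields:

```python
# Clear everything: metadata, execution count, and outputs.
cell.clear_metadata(
    cell_metadata_keep=(),
    cell_execution_count=True,
    cell_outputs=True,
)
assert cell.execution_count is None and cell.outputs == []
```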
### must_not_be_list_for_code_cells(values)

*classmethod*

Check that code cells have list-type outputs.
Source code in `databooks/data_models/notebook.py`

```python
@root_validator
def must_not_be_list_for_code_cells(cls, values: Dict[str, Any]) -> Dict[str, Any]:
    """Check that code cells have list-type outputs."""
    if values["cell_type"] == "code" and not isinstance(values["outputs"], list):
        raise ValueError(
            "All code cells must have a list output property, got"
            f" {type(values.get('outputs'))}"
        )
    return values
```
### only_code_cells_have_outputs_and_execution_count(values)

*classmethod*

Check that only code cells have outputs and execution count.
Source code in `databooks/data_models/notebook.py`

```python
@root_validator
def only_code_cells_have_outputs_and_execution_count(
    cls, values: Dict[str, Any]
) -> Dict[str, Any]:
    """Check that only code cells have outputs and execution count."""
    if values["cell_type"] != "code" and (
        ("outputs" in values) or ("execution_count" in values)
    ):
        raise ValueError(
            "Found `outputs` or `execution_count` for cell of type"
            f" `{values['cell_type']}`"
        )
    return values
```
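A sketch of the failure mode: the `ValueError` raised above surfaces as a pydantic `ValidationError`.

```python
from pydantic import ValidationError

# Hypothetical invalid cell: `outputs` on a markdown cell is rejected.
try:
    Cell(cell_type="markdown", metadata=CellMetadata(), source=[], outputs=[])
except ValidationError as err:
    print(err)  # Found `outputs` or `execution_count` for cell of type `markdown`
```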
## CellMetadata (DatabooksBase)

*pydantic-model*

Cell metadata. Empty by default but can accept extra fields.
## Cells (GenericModel, BaseCells)

*pydantic-model*

Similar to `list`, with the `-` operator using `difflib.SequenceMatcher`.

### data: List[T]

*property readonly*

Define property `data` required for the `collections.UserList` class.
### __get_validators__()

*classmethod special*

Get validators for custom class.
Source code in `databooks/data_models/notebook.py`

```python
@classmethod
def __get_validators__(cls) -> Generator[Callable[..., Any], None, None]:
    """Get validators for custom class."""
    yield cls.validate
```
### __init__(self, elements=())

*special*

Allow passing data as a positional argument when instantiating class.
Source code in `databooks/data_models/notebook.py`

```python
def __init__(self, elements: Sequence[T] = ()) -> None:
    """Allow passing data as a positional argument when instantiating class."""
    super(Cells, self).__init__(__root__=elements)
```
### __iter__(self)

*special*

Use list property as iterable.
Source code in `databooks/data_models/notebook.py`

```python
def __iter__(self) -> Generator[Any, None, None]:
    """Use list property as iterable."""
    return (el for el in self.data)
```
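A small sketch tying `__init__`, `data`, and `__iter__` together, reusing the hypothetical `cell` from above (the parametrized form `Cells[Cell]` is the one used in the library's own source):

```python
cells = Cells[Cell]([cell])       # positional data, stored under `__root__`
assert list(cells) == cells.data  # iteration goes through the `data` property
```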
### __sub__(self, other)

*special*

Return the difference using `difflib.SequenceMatcher`.
Source code in `databooks/data_models/notebook.py`

```python
def __sub__(
    self: Cells[Cell], other: Cells[Cell]
) -> Cells[Tuple[List[Cell], List[Cell]]]:
    """Return the difference using `difflib.SequenceMatcher`."""
    if type(self) != type(other):
        raise TypeError(
            f"Unsupported operand types for `-`: `{type(self).__name__}` and"
            f" `{type(other).__name__}`"
        )
    _self = deepcopy(self)
    _other = deepcopy(other)
    for cells in (_self, _other):
        for cell in cells:
            cell.remove_fields(["id"], missing_ok=True)

    # By setting the context to the max number of cells and using
    # `difflib.SequenceMatcher.get_grouped_opcodes` we essentially get the same
    # result as `difflib.SequenceMatcher.get_opcodes` but in smaller chunks
    n_context = max(len(_self), len(_other))
    diff_opcodes = list(
        SequenceMatcher(
            isjunk=None, a=_self, b=_other, autojunk=False
        ).get_grouped_opcodes(n_context)
    )

    if len(diff_opcodes) > 1:
        raise RuntimeError(
            "Expected one group for opcodes when context size is "
            f" {n_context} for {len(_self)} and {len(_other)} cells in"
            " notebooks."
        )
    return Cells[Tuple[List[Cell], List[Cell]]](
        [
            # https://github.com/python/mypy/issues/9459
            tuple((_self.data[i1:j1], _other.data[i2:j2]))  # type: ignore
            for _, i1, j1, i2, j2 in chain.from_iterable(diff_opcodes)
        ]
    )
```
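A diff sketch with two hypothetical containers (reusing `cell` from above): each element of the result pairs a run of cells from the first operand with the corresponding run from the second.

```python
md_cell = Cell(cell_type="markdown", metadata=CellMetadata(), source=["# Title"])

first = Cells[Cell]([cell])
last = Cells[Cell]([cell, md_cell])
diff = first - last  # Cells[Tuple[List[Cell], List[Cell]]]
for first_chunk, last_chunk in diff:
    print(len(first_chunk), len(last_chunk))  # e.g. "1 1", then "0 1"
```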
### resolve(self, *, keep_first_cells=None, first_id=None, last_id=None, **kwargs)

Resolve differences between `databooks.data_models.notebook.Cells`.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `keep_first_cells` | `Optional[bool]` | Whether to keep the cells of the first notebook or not. If `None`, then keep both, wrapping the git-diff tags | `None` |
| `first_id` | `Optional[str]` | Git hash of first file in conflict | `None` |
| `last_id` | `Optional[str]` | Git hash of last file in conflict | `None` |
| `kwargs` | `Any` | (Unused) keyword arguments to keep compatibility with `databooks.data_models.base.resolve` | `{}` |

Returns:

| Type | Description |
|---|---|
| `List[Cell]` | List of cells |
Source code in `databooks/data_models/notebook.py`

```python
def resolve(
    self: Cells[Tuple[List[Cell], List[Cell]]],
    *,
    keep_first_cells: Optional[bool] = None,
    first_id: Optional[str] = None,
    last_id: Optional[str] = None,
    **kwargs: Any,
) -> List[Cell]:
    """
    Resolve differences between `databooks.data_models.notebook.Cells`.

    :param keep_first_cells: Whether to keep the cells of the first notebook or not.
        If `None`, then keep both wrapping the git-diff tags
    :param first_id: Git hash of first file in conflict
    :param last_id: Git hash of last file in conflict
    :param kwargs: (Unused) keyword arguments to keep compatibility with
        `databooks.data_models.base.resolve`
    :return: List of cells
    """
    if keep_first_cells is not None:
        return list(
            chain.from_iterable(pairs[not keep_first_cells] for pairs in self.data)
        )
    return list(
        chain.from_iterable(
            Cells.wrap_git(
                first_cells=val[0],
                last_cells=val[1],
                hash_first=first_id,
                hash_last=last_id,
            )
            if val[0] != val[1]
            else val[0]
            for val in self.data
        )
    )
```
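A resolution sketch using the hypothetical `diff` from the example above:

```python
# Keep the first notebook's cells wherever the two versions differ.
resolved = diff.resolve(keep_first_cells=True)

# Or keep both versions, wrapped between git-style conflict markers
# ("1a2b3c" / "4d5e6f" are hypothetical git hashes).
resolved = diff.resolve(first_id="1a2b3c", last_id="4d5e6f")
```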
### validate(v)

*classmethod*

Ensure object is custom defined container.
Source code in `databooks/data_models/notebook.py`

```python
@classmethod
def validate(cls, v: List[T]) -> Cells[T]:
    """Ensure object is custom defined container."""
    if not isinstance(v, cls):
        return cls(v)
    else:
        return v
```
### wrap_git(first_cells, last_cells, hash_first=None, hash_last=None)

*classmethod*

Wrap git-diff cells in existing notebook.
Source code in `databooks/data_models/notebook.py`

```python
@classmethod
def wrap_git(
    cls,
    first_cells: List[Cell],
    last_cells: List[Cell],
    hash_first: Optional[str] = None,
    hash_last: Optional[str] = None,
) -> List[Cell]:
    """Wrap git-diff cells in existing notebook."""
    return (
        [
            Cell(
                metadata=CellMetadata(git_hash=hash_first),
                source=[f"`<<<<<<< {hash_first}`"],
                cell_type="markdown",
            )
        ]
        + first_cells
        + [
            Cell(
                source=["`=======`"],
                cell_type="markdown",
                metadata=CellMetadata(),
            )
        ]
        + last_cells
        + [
            Cell(
                metadata=CellMetadata(git_hash=hash_last),
                source=[f"`>>>>>>> {hash_last}`"],
                cell_type="markdown",
            )
        ]
    )
```
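A sketch with hypothetical hashes and the cells from earlier examples, showing the shape of the result:

```python
merged = Cells.wrap_git(
    first_cells=[cell],
    last_cells=[md_cell],
    hash_first="1a2b3c",
    hash_last="4d5e6f",
)
# merged == [markdown cell "`<<<<<<< 1a2b3c`"] + [cell]
#         + [markdown cell "`=======`"] + [md_cell]
#         + [markdown cell "`>>>>>>> 4d5e6f`"]
```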
## JupyterNotebook (DatabooksBase)

*pydantic-model*

Jupyter notebook. Extra fields yield an invalid notebook.
### clear_metadata(self, *, notebook_metadata_keep=None, notebook_metadata_remove=None, **cell_kwargs)

Clear notebook and cell metadata.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `notebook_metadata_keep` | `Sequence[str]` | Metadata values to keep - simply pass an empty sequence (i.e.: `()`) to remove all extra fields | `None` |
| `notebook_metadata_remove` | `Sequence[str]` | Metadata values to remove | `None` |
| `cell_kwargs` | `Any` | Keyword arguments to be passed to each cell's `databooks.data_models.Cell.clear_metadata` | `{}` |

Returns:

| Type | Description |
|---|---|
| `None` | |
Source code in `databooks/data_models/notebook.py`

```python
def clear_metadata(
    self,
    *,
    notebook_metadata_keep: Sequence[str] = None,
    notebook_metadata_remove: Sequence[str] = None,
    **cell_kwargs: Any,
) -> None:
    """
    Clear notebook and cell metadata.

    :param notebook_metadata_keep: Metadata values to keep - simply pass an empty
        sequence (i.e.: `()`) to remove all extra fields.
    :param notebook_metadata_remove: Metadata values to remove
    :param cell_kwargs: keyword arguments to be passed to each cell's
        `databooks.data_models.Cell.clear_metadata`
    :return:
    """
    nargs = sum(
        (notebook_metadata_keep is not None, notebook_metadata_remove is not None)
    )
    if nargs != 1:
        raise ValueError(
            "Exactly one of `notebook_metadata_keep` or `notebook_metadata_remove`"
            f" must be passed, got {nargs} arguments."
        )
    if notebook_metadata_keep is not None:
        notebook_metadata_remove = tuple(
            field
            for field, _ in self.metadata
            if field not in notebook_metadata_keep
        )
    self.metadata.remove_fields(notebook_metadata_remove)  # type: ignore

    if len(cell_kwargs) > 0:
        _clean_cells = deepcopy(self.cells)
        for cell in _clean_cells:
            cell.clear_metadata(**cell_kwargs)
        self.cells = _clean_cells
```
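A sketch combining notebook- and cell-level clearing, with a hypothetical `JupyterNotebook` instance `nb`; per the source, the extra keyword arguments are forwarded to each cell's `clear_metadata`:

```python
nb.clear_metadata(
    notebook_metadata_keep=(),  # drop all notebook-level metadata
    cell_metadata_keep=(),      # forwarded: drop all cell-level metadata
    cell_execution_count=True,  # forwarded: clear execution counts
    cell_outputs=True,          # forwarded: clear outputs
)
```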
### parse_file(path, **parse_kwargs)

*classmethod*

Parse notebook from a path.
Source code in `databooks/data_models/notebook.py`

```python
@classmethod
def parse_file(cls, path: Path | str, **parse_kwargs: Any) -> JupyterNotebook:
    """Parse notebook from a path."""
    content_arg = parse_kwargs.pop("content_type", None)
    if content_arg is not None:
        raise ValueError(
            f"Value of `content_type` must be `json` (default), got `{content_arg}`"
        )
    return super(JupyterNotebook, cls).parse_file(
        path=path, content_type="json", **parse_kwargs
    )
```
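A parsing sketch, assuming a `notebook.ipynb` file exists in the working directory:

```python
from databooks.data_models.notebook import JupyterNotebook

nb = JupyterNotebook.parse_file("notebook.ipynb")  # always parsed as JSON
print(len(nb.cells))
```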
## NotebookMetadata (DatabooksBase)

*pydantic-model*

Notebook metadata. Empty by default but can accept extra fields.