Coverage for databooks/metadata.py: 94%

31 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-11-09 13:11 +0000

1"""Metadata wrapper functions for cleaning notebook metadata.""" 

2from pathlib import Path 

3from typing import Any, Callable, List, Optional, Sequence 

4 

5from databooks import JupyterNotebook 

6from databooks.data_models.cell import BaseCell 

7from databooks.logging import get_logger, set_verbose 

8 

# Module-level logger; `clear` passes it to `set_verbose` when `verbose=True`.
logger = get_logger(__file__)

10 

11 

def clear(
    read_path: Path,
    write_path: Optional[Path] = None,
    notebook_metadata_keep: Sequence[str] = (),
    cell_metadata_keep: Sequence[str] = (),
    cell_fields_keep: Sequence[str] = (),
    check: bool = False,
    verbose: bool = False,
    overwrite: bool = False,
    **kwargs: Any,
) -> bool:
    """
    Clear Jupyter Notebook metadata.

    Clear metadata (at notebook and cell level) and write clean
     notebook. By default, remove all metadata.
    :param read_path: Path of notebook file with metadata to be cleaned
    :param write_path: Path where the cleaned notebook is written (defaults to
     `read_path` when not given)
    :param notebook_metadata_keep: Notebook metadata fields to keep
    :param cell_metadata_keep: Cell metadata fields to keep
    :param cell_fields_keep: Cell fields to keep
    :param check: Don't write any files, check whether there is unwanted metadata
    :param verbose: Log written files
    :param overwrite: Whether to overwrite files (if exists)
    :param kwargs: Additional keyword arguments to pass to
     `databooks.data_models.JupyterNotebook.clear_metadata`
    :return: Whether the cleaned notebook equals the one on disk, i.e. `True`
     when there was no metadata to remove
    """
    if verbose:
        set_verbose(logger)

    if write_path is None:
        write_path = read_path
    notebook = JupyterNotebook.parse_file(read_path)

    # Get fields to remove from cells and keep notebook schema: every field seen
    # on any cell is a removal candidate unless the caller asked to keep it or it
    # is declared on `BaseCell`.
    cell_fields = {field for cell in notebook.cells for field, _ in cell if field}
    fields_to_keep = {*cell_fields_keep, *BaseCell.__fields__}
    cell_remove_fields = [
        field for field in cell_fields if field not in fields_to_keep
    ]

    notebook.clear_metadata(
        notebook_metadata_keep=notebook_metadata_keep,
        cell_metadata_keep=cell_metadata_keep,
        cell_remove_fields=cell_remove_fields,
        **kwargs,
    )
    # Compare against a fresh parse of the file on disk.
    # NOTE(review): this presumes `clear_metadata` mutates `notebook` in place
    # (its return value is not used) - confirm.
    nb_equals = notebook == JupyterNotebook.parse_file(read_path)

    if nb_equals or check:
        # Nothing is written when there is nothing to change or in check-only mode.
        msg = (
            "no metadata to remove."
            if nb_equals
            else "only check (unwanted metadata found)."
        )
        logger.debug(f"No action taken for {read_path} - {msg}")
    else:
        notebook.write(path=write_path, overwrite=overwrite)
        logger.debug(f"Removed metadata from {read_path}, saved as {write_path}")

    return nb_equals

75 

76 

def clear_all(
    read_paths: List[Path],
    write_paths: List[Path],
    *,
    progress_callback: Callable[[], None] = lambda: None,
    **clear_kwargs: Any,
) -> List[bool]:
    """
    Clear metadata for multiple notebooks at notebooks and cell level.

    :param read_paths: Paths of notebook to remove metadata
    :param write_paths: Paths of where to write cleaned notebooks
    :param progress_callback: Callback function to report progress (called once
     after each notebook is processed)
    :param clear_kwargs: Keyword arguments to be passed to `databooks.metadata.clear`
    :return: One value per notebook, in input order, as returned by
     `databooks.metadata.clear` (`True` when the notebook had no unwanted metadata)
    :raises ValueError: If `read_paths` and `write_paths` differ in length
    """
    if len(read_paths) != len(write_paths):
        raise ValueError(
            "Read and write paths must have same length."
            f" Got {len(read_paths)} and {len(write_paths)}"
        )
    checks = []
    for nb_path, write_path in zip(read_paths, write_paths):
        checks.append(clear(read_path=nb_path, write_path=write_path, **clear_kwargs))
        progress_callback()
    return checks