{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "9d2c4e7a",
   "metadata": {},
   "source": [
    "# Build document structure\n",
    "\n",
    "Walks `IN_FOLDER`, collects the `.rm` and `.metadata` files found there, and parses a `.metadata` file together with its sibling `.content` file into a plain dictionary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "437f8866",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "import json\n",
    "import hashlib\n",
    "\n",
    "## Parameter\n",
    "\n",
    "IN_FOLDER = './Library/'\n",
    "\n",
    "# Substring used by the lookup cell to list every file whose path mentions one document.\n",
    "DEBUG_DOCUMENT_ID = '03ec4477-8df9-49c1-96e4-dfca75f32623'\n",
    "\n",
    "# Position (in the sorted list of .metadata files) of the document parsed as a smoke test.\n",
    "SAMPLE_INDEX = 12\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e0f01fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Helper Functions\n",
    "def list_files_pathlib(path=Path('.')):\n",
    "    \"\"\"Recursively collect every regular file below `path`.\n",
    "\n",
    "    `path` may be a `str` or a `Path`. Entries are visited in sorted order so\n",
    "    the returned list (and any index into it) is identical on every run;\n",
    "    `Path.iterdir` itself yields entries in arbitrary order.\n",
    "\n",
    "    Raises whatever `Path.iterdir` raises when `path` is not a readable\n",
    "    directory (e.g. `FileNotFoundError`).\n",
    "    \"\"\"\n",
    "    files = []\n",
    "    for entry in sorted(Path(path).iterdir()):\n",
    "        if entry.is_file():\n",
    "            files.append(entry)\n",
    "        elif entry.is_dir():\n",
    "            files.extend(list_files_pathlib(entry))\n",
    "    return files\n",
    "\n",
    "\n",
    "# Specify the directory path you want to start from\n",
    "\n",
    "file_list = list_files_pathlib(Path(IN_FOLDER))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe452ca8",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_files = [p for p in file_list if p.suffix == '.rm']\n",
    "meta_data_files = [p for p in file_list if p.suffix == '.metadata']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73c6e17a",
   "metadata": {},
   "outputs": [],
   "source": [
    "meta_data_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a3b5926",
   "metadata": {},
   "outputs": [],
   "source": [
    "[p for p in file_list if DEBUG_DOCUMENT_ID in str(p)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b04fdfa2",
   "metadata": {},
   "outputs": [],
   "source": [
    "## scripts to build document structure\n",
    "\n",
    "\n",
    "## Todo: check which documents needs updates via hashes\n",
    "\n",
    "\n",
    "def _load_json(filename):\n",
    "    \"\"\"Read one JSON file as UTF-8 and close the handle afterwards.\"\"\"\n",
    "    with open(filename, encoding='utf-8') as handle:\n",
    "        return json.load(handle)\n",
    "\n",
    "\n",
    "def parse_document_data(filename):\n",
    "    \"\"\"List the pages referenced by a `.content` file.\n",
    "\n",
    "    `filename` may be a `str` or a `Path`. Returns one dict per entry of\n",
    "    `cPages.pages` with the keys `page_id` (the entry's `id`) and `filename`\n",
    "    (the path `<parent>/<stem>/<page id>` derived from the `.content` location).\n",
    "\n",
    "    A missing file, or a file without `cPages.pages`, yields an empty list\n",
    "    instead of raising.\n",
    "    NOTE(review): presumably folder-type entries carry no page list - confirm.\n",
    "    \"\"\"\n",
    "    filename = Path(filename)\n",
    "    if not filename.is_file():\n",
    "        return []\n",
    "    data = _load_json(filename)\n",
    "    page_list = (data.get('cPages') or {}).get('pages') or []\n",
    "    document_folder = filename.parent / filename.stem  # construct path to subpages\n",
    "    return [\n",
    "        {\n",
    "            'page_id': element['id'],\n",
    "            'filename': document_folder / element['id'],\n",
    "        }\n",
    "        for element in page_list\n",
    "    ]\n",
    "\n",
    "\n",
    "def parse_metadata(filename):\n",
    "    \"\"\"Parse a `.metadata` file and the `.content` file stored next to it.\n",
    "\n",
    "    `filename` may be a `str` or a `Path`. Returns a dict with the keys\n",
    "    `last_modified`, `document_name`, `document_type`, `parent_folder` and\n",
    "    `page_ids` (the list produced by `parse_document_data`).\n",
    "\n",
    "    Raises `KeyError` when one of the metadata fields `lastModified`,\n",
    "    `visibleName`, `type` or `parent` is missing.\n",
    "    \"\"\"\n",
    "    filename = Path(filename)\n",
    "    data = _load_json(filename)\n",
    "    # Same directory and stem as the metadata file, with a '.content' suffix.\n",
    "    content_filename = filename.with_suffix('.content')\n",
    "    return {\n",
    "        'last_modified': data['lastModified'],\n",
    "        'document_name': data['visibleName'],\n",
    "        'document_type': data['type'],\n",
    "        'parent_folder': data['parent'],\n",
    "        'page_ids': parse_document_data(content_filename)\n",
    "    }\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb699ffa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smoke test: parse a single document.\n",
    "# A plain string path is accepted as well, e.g.\n",
    "# parse_metadata('fc2e4b57-6ce4-48ae-8452-b6d7873967aa.metadata')\n",
    "\n",
    "assert meta_data_files, f'no .metadata files found under {IN_FOLDER!r}'\n",
    "\n",
    "# Clamp the index so a library with fewer entries does not raise IndexError.\n",
    "test = parse_metadata(meta_data_files[min(SAMPLE_INDEX, len(meta_data_files) - 1)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1deb665",
   "metadata": {},
   "outputs": [],
   "source": [
    "test"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Py2025",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}