From 4a2aed5bcf7e2ab26ea5e8ecf15e6adb3f87b628 Mon Sep 17 00:00:00 2001 From: Jason Streifling Date: Tue, 4 Feb 2025 17:48:54 +0100 Subject: [PATCH] Add possibility to create table of contents for docx imported articles --- cmd/backend/create_toc.lua | 33 +++++++++++++++++++++++++++++++++ cmd/backend/docx.go | 5 ++++- 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 cmd/backend/create_toc.lua diff --git a/cmd/backend/create_toc.lua b/cmd/backend/create_toc.lua new file mode 100644 index 0000000..f08bf15 --- /dev/null +++ b/cmd/backend/create_toc.lua @@ -0,0 +1,33 @@ +-- Helper function: remove all image inlines from a list of inlines. +local function remove_images(inlines) + local result = {} + for _, item in ipairs(inlines) do + if item.t ~= "Image" then + table.insert(result, item) + end + end + return result +end + +-- Build a bullet list representing the table of contents. +local function build_toc(doc) + local toc_items = {} + for _, block in ipairs(doc.blocks) do + if block.t == "Header" then + local clean_inlines = remove_images(block.content) + local header_text = pandoc.utils.stringify(clean_inlines) + if header_text ~= "" then + local link = pandoc.Link(clean_inlines, "#" .. block.identifier) + table.insert(toc_items, { link }) + end + end + end + return pandoc.BulletList(toc_items) +end + +-- The Pandoc function runs after the document is fully constructed. +function Pandoc(doc) + local toc = build_toc(doc) + table.insert(doc.blocks, 1, toc) -- Insert the TOC at the very beginning of the document. 
+ return doc +end diff --git a/cmd/backend/docx.go b/cmd/backend/docx.go index 5413c6f..597356b 100644 --- a/cmd/backend/docx.go +++ b/cmd/backend/docx.go @@ -21,7 +21,7 @@ func ConvertToMarkdown(c *Config, filename string) ([]byte, error) { defer os.RemoveAll(tmpDir) articleFileName := filepath.Join(os.TempDir(), fmt.Sprint(uuid.New(), ".md")) - cmd := exec.Command("pandoc", "-s", "-f", "docx", "-t", "commonmark_x", "-o", articleFileName, "--extract-media", tmpDir, filename) // TODO: Is writing to a file necessary? + cmd := exec.Command("pandoc", "-s", "--lua-filter=cmd/backend/create_toc.lua", "-f", "docx", "-t", "commonmark_x", "-o", articleFileName, "--extract-media", tmpDir, filename) // TODO: Is writing to a file necessary? cmd.Stderr = &stderr if err = cmd.Run(); err != nil { return nil, fmt.Errorf("error converting docx to markdown: %v: %v", err, stderr.String()) @@ -33,6 +33,9 @@ func ConvertToMarkdown(c *Config, filename string) ([]byte, error) { return nil, fmt.Errorf("error reading markdown file: %v", err) } + re := regexp.MustCompile(`\{width=[^}]+height=[^}]+\}`) + articleContent = re.ReplaceAll(articleContent, []byte("")) + imageNames, err := filepath.Glob(filepath.Join(tmpDir, "media", "*")) if err != nil { return nil, fmt.Errorf("error getting docx images from temporary directory: %v", err)