diff --git a/cmd/backend/docx.go b/cmd/backend/docx.go index 5413c6f..dcc8a2a 100644 --- a/cmd/backend/docx.go +++ b/cmd/backend/docx.go @@ -20,8 +20,9 @@ func ConvertToMarkdown(c *Config, filename string) ([]byte, error) { } defer os.RemoveAll(tmpDir) + // --toc articleFileName := filepath.Join(os.TempDir(), fmt.Sprint(uuid.New(), ".md")) - cmd := exec.Command("pandoc", "-s", "-f", "docx", "-t", "commonmark_x", "-o", articleFileName, "--extract-media", tmpDir, filename) // TODO: Is writing to a file necessary? + cmd := exec.Command("pandoc", "-s", "--lua-filter=cmd/backend/remove-header-images.lua", "-f", "docx", "-t", "commonmark_x", "-o", articleFileName, "--extract-media", tmpDir, filename) // TODO: Is writing to a file necessary? cmd.Stderr = &stderr if err = cmd.Run(); err != nil { return nil, fmt.Errorf("error converting docx to markdown: %v: %v", err, stderr.String()) @@ -33,6 +34,9 @@ func ConvertToMarkdown(c *Config, filename string) ([]byte, error) { return nil, fmt.Errorf("error reading markdown file: %v", err) } + re := regexp.MustCompile(`\{width=[^}]+height=[^}]+\}`) + articleContent = re.ReplaceAll(articleContent, []byte("")) + imageNames, err := filepath.Glob(filepath.Join(tmpDir, "media", "*")) if err != nil { return nil, fmt.Errorf("error getting docx images from temporary directory: %v", err) diff --git a/cmd/backend/remove-header-images.lua b/cmd/backend/remove-header-images.lua new file mode 100644 index 0000000..e705ee7 --- /dev/null +++ b/cmd/backend/remove-header-images.lua @@ -0,0 +1,42 @@ +-- custom-toc.lua +-- This filter builds a custom TOC without images in the link text, +-- leaving the actual header content (with images) untouched. + +-- Helper function: remove all image inlines from a list of inlines. 
-- Strip every Image from a list of inlines, including images nested inside
-- other inlines (Emph, Strong, Span, ...).
--
-- The inlines are wrapped in a temporary Plain block so that
-- pandoc.walk_block can traverse them. walk_block returns a filtered copy,
-- so the list passed in (the real header content) is left untouched.
--
-- @param inlines list of pandoc inline elements
-- @return the same inlines with all Image elements removed
local function remove_images(inlines)
  local stripped = pandoc.walk_block(pandoc.Plain(inlines), {
    -- Returning an empty list from a filter function deletes the element.
    Image = function()
      return {}
    end,
  })
  return stripped.content
end

-- Build a flat bullet list that links to every top-level header of `doc`.
--
-- Only blocks directly in doc.blocks are inspected. Headers whose text is
-- empty once images are removed (e.g. image-only headers) are skipped.
--
-- @param doc the pandoc document
-- @return a BulletList of links, or nil when no header qualifies
local function build_toc(doc)
  local toc_items = {}
  for _, block in ipairs(doc.blocks) do
    if block.t == "Header" then
      -- Link text is the header content minus images.
      local clean_inlines = remove_images(block.content)
      if pandoc.utils.stringify(clean_inlines) ~= "" then
        local link = pandoc.Link(clean_inlines, "#" .. block.identifier)
        -- A bullet-list item is a list of blocks, so the inline Link is
        -- wrapped in a Plain block instead of being inserted bare.
        table.insert(toc_items, { pandoc.Plain({ link }) })
      end
    end
  end
  if #toc_items == 0 then
    -- Nothing to list: let the caller skip inserting an empty bullet list.
    return nil
  end
  return pandoc.BulletList(toc_items)
end

-- Filter entry point; pandoc calls this once with the whole document.
-- Prepends the image-free table of contents to the document body and leaves
-- the document unchanged when there are no headers to list.
function Pandoc(doc)
  local toc = build_toc(doc)
  if toc ~= nil then
    table.insert(doc.blocks, 1, toc)
  end
  return doc
end