branch: externals/org-transclusion commit 51623cbc0b23c6ed57c25ce6861fa1ecf2a92805 Author: Joseph Turner <jos...@breatheoutbreathe.in> Commit: Joseph Turner <jos...@breatheoutbreathe.in>
Add option to transclude local HTML content rendered as Org With org-transclusion-http added to org-transclusion-set-extensions, org-transclusion will now convert HTML files to Org using Pandoc and render their content in a transclusion. This commit also removes the networking logic for retrieving HTML files over a network. That functionality will be handled by a separate package. --- org-transclusion-html.el | 68 ++++++++++++++++++++++++-- org-transclusion-http.el | 105 ----------------------------------------- org-transclusion.el | 5 +- test/org-transclusion-html.org | 2 + test/org-transclusion-http.org | 17 ------- test/source-html-no-ext | 17 +++++++ 6 files changed, 85 insertions(+), 129 deletions(-) diff --git a/org-transclusion-html.el b/org-transclusion-html.el index d267b8b46d..f586c829fb 100644 --- a/org-transclusion-html.el +++ b/org-transclusion-html.el @@ -18,10 +18,14 @@ ;;; Commentary: -;; This file contains functionality related to converting HTML content to Org. Features include: -;; -;; - Convert HTML to Org using Pandoc, a lá `org-web-tools' -;; + Convert only HTML headings matching link anchor +;; This is an extension to `org-transclusion'. When active, it enables +;; transclusion of HTML files by converting HTML to Org with Pandoc. +;; When a link anchor is specified only the HTML headings matching are +;; transcluded. Does not support live-sync. + +;; Requires Pandoc to be installed and in the $PATH. Conversion of +;; HTML to Org using Pandoc inspired by `org-web-tools'. + ;;; Code: @@ -30,13 +34,67 @@ (require 'org) (require 'cl-lib) (require 'pcase) +(require 'dom) + +;;;; Hook into org-transclusion + +(add-hook 'org-transclusion-add-functions #'org-transclusion-add-html-file) ;;;; Functions +;;;;; Add HTML file + +(defun org-transclusion-add-html-file (link plist) + "Return a list for HTML file LINK object and PLIST. +Return nil if not found." + (and (string= "file" (org-element-property :type link)) + (or (string-suffix-p ".html" (org-element-property :path link)) + (with-current-buffer (find-file-noselect + (org-element-property :path link) t) + (org-transclusion-html--html-p (current-buffer)))) + (append '(:tc-type "html-org-file") + (org-transclusion-html-org-file-content link plist)))) + +(defun org-transclusion-html-org-file-content (link _plist) + "Return payload list without :tc-type. +:src-content value will be Org format converted from HTML at LINK." + (let* ((path (org-element-property :path link)) + (html-buf (find-file-noselect path t)) + (org-buf + (generate-new-buffer + (format " *org-transclusion-html-org %s*" (expand-file-name path)))) + (src-content + (with-current-buffer org-buf + (insert-buffer-substring html-buf) + ;; TODO: It's not currently possible to link an HTML + ;; anchor inside of a 'file:' Org link, but if it ever + ;; becomes possible, we can use this: + + ;; (let ((dom (with-current-buffer html-buf + ;; (libxml-parse-html-region)))) + ;; (when (dom-by-id dom (format "\\`%s\\'" target)) + ;; ;; Page contains id element matching link target. + ;; (erase-buffer) + ;; (dom-print (org-transclusion-html--target-content dom target)))) + (org-transclusion--insert-org-from-html-with-pandoc) + (buffer-string)))) + (with-current-buffer html-buf + (org-with-wide-buffer + (list :src-buf (current-buffer) + :src-beg (point-min) + :src-end (point-max) + :src-content src-content))))) + +;;;;; Utilities + (defun org-transclusion-html--target-content (dom target) "Return DOM element(s) that correspond to the TARGET. Since anchors may refer to headings but not the text following -the heading, this function may not return the expected element." +the heading, this function may not return the expected element. + +While is not possible to specify an HTML anchor in a file: Org +link, this function is useful in other libraries for transcluding +sections of HTML documents linked via http://, hyper://, etc.." ;; HTML link fragments (targets) point to a specific point in a document, ;; not a range of text. This function attempts to guess what range of ;; text a target refers to based on what HTML element is targeted. diff --git a/org-transclusion-http.el b/org-transclusion-http.el deleted file mode 100644 index 36d678b2f7..0000000000 --- a/org-transclusion-http.el +++ /dev/null @@ -1,105 +0,0 @@ -;;; org-transclusion-http.el --- Transclude over HTTP -*- lexical-binding: t; -*- - -;; Copyright (C) 2024 Free Software Foundation, Inc. - -;; This program is free software; you can redistribute it and/or -;; modify it under the terms of the GNU Affero General Public License -;; as published by the Free Software Foundation, either version 3 of -;; the License, or (at your option) any later version. - -;; This program is distributed in the hope that it will be useful, but -;; WITHOUT ANY WARRANTY; without even the implied warranty of -;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;; Affero General Public License for more details. - -;; You should have received a copy of the GNU Affero General Public -;; License along with this program. If not, see -;; <https://www.gnu.org/licenses/>. - -;;; Commentary: - -;; This file contains functionality related to transcluding content over HTTP -;; using plz.el. Features include: -;; -;; - Transclude plain text -;; + Transclude only Org headings matching search options -;; - Transclude HTML converted to Org using Pandoc, a lá `org-web-tools' -;; + Transclude only HTML headings matching link anchor -;; - TODO: Support :lines - -;;; Code: - -;;;; Requirements - -(require 'org) -(require 'org-element) -(require 'org-transclusion) -(require 'org-transclusion-html) -(require 'cl-lib) -(require 'pcase) -(require 'plz) - -;;;; Functions - -(defun org-transclusion-http-add-file (link _plist) - "Return callback function when HTTP transclusion is appropriate. -Otherwise, return nil. Intended to be added to -`org-transclusion-add-functions', which see for descriptions of -arguments LINK and PLIST." - (pcase (org-element-property :type link) - ((or "http" "https") - (message "Asynchronously transcluding over HTTP at point %d, line %d..." - (point) (org-current-line)) - #'org-transclusion-http-add-callback))) - -(add-hook 'org-transclusion-add-functions #'org-transclusion-http-add-file) - -(defun org-transclusion-http-add-callback (link plist copy) - "Load HTTP file at LINK and call -`org-transclusion-add-callback' with PAYLOAD, LINK, PLIST, COPY." - (pcase-let* ((target-mkr (point-marker)) - (url (org-element-property :raw-link link)) - ((cl-struct url filename target) (url-generic-parse-url url)) - (tc-type)) - (plz 'get url - :as 'buffer - :then - (lambda (_response-buffer) - (when-let ((target-buf (marker-buffer target-mkr))) - (cond - ((org-transclusion-html--html-p (current-buffer)) ; HTML - (let ((dom (libxml-parse-html-region))) - (when (dom-by-id dom (format "\\`%s\\'" target)) - ;; Page contains id element matching link target. - (erase-buffer) - (dom-print (org-transclusion-html--target-content dom target))) - (org-transclusion--insert-org-from-html-with-pandoc) - ;; Use "org-http" `tc-type' since HTML is converted to Org mode. - (setf tc-type "org-http"))) - ((org-transclusion-org-file-p filename) ; Org-mode - ;; FIXME: filename may contain a query string, so it may not end - ;; with "org" or "org.gpg". For example, - ;; https://example.com/foobar.org?query=answer has the filename - ;; /foobar.org?query=answer and therefore doesn't match. - (when target - (org-mode) - (let ((org-link-search-must-match-exact-headline t)) - (when (with-demoted-errors "org-transclusion-http error:\n%s\ntranscluding whole file..." - (org-link-search (format "#%s" target))) - (org-narrow-to-subtree)))) - (setf tc-type "org-http")) - (t ; All other file types - (setf tc-type "others-http"))) - (let* ((payload-without-type - (org-transclusion-content-org-buffer-or-element nil plist)) - (payload (append `(:tc-type ,tc-type) payload-without-type))) - (with-current-buffer target-buf - (org-with-wide-buffer - (goto-char (marker-position target-mkr)) - (org-transclusion-add-callback payload link plist copy))))))))) - -;;;; Footer - -(provide 'org-transclusion-http) - -;;; org-transclusion-http.el ends here diff --git a/org-transclusion.el b/org-transclusion.el index 9797881512..ccd399694a 100644 --- a/org-transclusion.el +++ b/org-transclusion.el @@ -23,7 +23,7 @@ ;; Keywords: org-mode, transclusion, writing ;; Version: 1.3.2 -;; Package-Requires: ((emacs "27.1") (org "9.4") (plz "0.7.2")) +;; Package-Requires: ((emacs "27.1") (org "9.4")) ;; This file is not part of GNU Emacs. @@ -68,7 +68,8 @@ Intended for :set property for `customize'." (const :tag "font-lock: Add font-lock for Org-transclusion" org-transclusion-font-lock) (const :tag "indent-mode: Support org-indent-mode" org-transclusion-indent-mode) - (const :tag "http: Transclude content over HTTP" org-transclusion-http) + (const :tag "html: Transclude HTML converted to Org with Pandoc" + org-transclusion-http) (repeat :tag "Other packages" :inline t (symbol :tag "Package")))) (defcustom org-transclusion-add-all-on-activate t diff --git a/test/org-transclusion-html.org b/test/org-transclusion-html.org new file mode 100644 index 0000000000..ca0a438dd6 --- /dev/null +++ b/test/org-transclusion-html.org @@ -0,0 +1,2 @@ +[[file:source-html-no-ext]] +#+transclude: [[file:source-html-no-ext]] diff --git a/test/org-transclusion-http.org b/test/org-transclusion-http.org deleted file mode 100644 index a57a2a4608..0000000000 --- a/test/org-transclusion-http.org +++ /dev/null @@ -1,17 +0,0 @@ -# id="care" points to <h2>: Transclude <h2> and content after <h2> also. -#+transclude: [[https://ushin.org/needs-list.html#care]] - -# Org file with heading CUSTOM_ID: "care": Transclude only that heading. -#+transclude: [[https://ushin.org/needs-list.org#care]] - -# Nonexistent target in Org file: Transclude entire file. -#+transclude: [[https://ushin.org/needs-list.org#nonexistent-target]] - -# Nonexistent target in HTML file: Transclude entire file as Org. -#+transclude: [[https://ushin.org/needs-list.html#nonexistent-target]] - -# id="jabber" points to <section>: Transclude only <section> content. -#+transclude: [[https://jmp.chat/faq#jabber]] - -# id="autocapitalize" points to <dt>: Transclude <dt> and subsequent <dd>. -#+transclude: [[https://developer.mozilla.org/en-US/docs/Web/HTML/Element/textarea#autocapitalize]] diff --git a/test/source-html-no-ext b/test/source-html-no-ext new file mode 100644 index 0000000000..d8a1cbd51e --- /dev/null +++ b/test/source-html-no-ext @@ -0,0 +1,17 @@ +<?xml version="1.0" encoding="utf-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"> + <body> + <div id="content" class="content"> + <div id="outline-container-orgf29e149" class="outline-2"> + <h2 id="orgf29e149">hi there</h2> + </div> + </div> + <div id="postamble" class="status"> + <p class="author">Author: Joseph Turner</p> + <p class="date">Created: 2024-03-30 Sat 00:11</p> + <p class="validation"><a href="https://validator.w3.org/check?uri=referer">Validate</a></p> + </div> + </body> +</html>