branch: externals/org-transclusion commit 85623b8ae7907df21dc8c9efdab9090bf8af1aa7 Author: Joseph Turner <jos...@breatheoutbreathe.in> Commit: Joseph Turner <jos...@breatheoutbreathe.in>
Add org-transclusion-http.el: Transclude content over HTTP --- org-transclusion-http.el | 227 +++++++++++++++++++++++++++++++++++++++++ org-transclusion.el | 3 +- test/org-transclusion-http.org | 17 +++ 3 files changed, 246 insertions(+), 1 deletion(-) diff --git a/org-transclusion-http.el b/org-transclusion-http.el new file mode 100644 index 0000000000..3807ac2883 --- /dev/null +++ b/org-transclusion-http.el @@ -0,0 +1,227 @@ +;;; org-transclusion-http.el --- Transclude over HTTP -*- lexical-binding: t; -*- + +;; Copyright (C) 2024 Free Software Foundation, Inc. + +;; This program is free software; you can redistribute it and/or +;; modify it under the terms of the GNU Affero General Public License +;; as published by the Free Software Foundation, either version 3 of +;; the License, or (at your option) any later version. + +;; This program is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; Affero General Public License for more details. + +;; You should have received a copy of the GNU Affero General Public +;; License along with this program. If not, see +;; <https://www.gnu.org/licenses/>. + +;;; Commentary: + +;; This file contains functionality related to transcluding content over HTTP +;; using plz.el. Features include: +;; +;; - Transclude plain text +;; + Transclude only Org headings matching search options +;; - Transclude HTML converted to Org using Pandoc, a lá `org-web-tools' +;; + Transclude only HTML headings matching link anchor +;; - TODO: Support :lines + +;;; Code: + +;;;; Requirements + +(require 'org) +(require 'org-element) +(require 'org-transclusion) +(require 'cl-lib) +(require 'pcase) +(require 'plz) + +;;;; Functions + +(defun org-transclusion-http-add-file (link _plist) + "Return callback function when HTTP transclusion is appropriate. +Otherwise, return nil. Intended to be added to +`org-transclusion-add-functions', which see for descriptions of +arguments LINK and PLIST." + (pcase (org-element-property :type link) + ((or "http" "https") + (message "Asynchronously transcluding over HTTP at point %d, line %d..." + (point) (org-current-line)) + #'org-transclusion-http-add-callback))) + +(add-hook 'org-transclusion-add-functions #'org-transclusion-http-add-file) + +(defun org-transclusion-http-add-callback (link plist copy) + "Load HTTP file at LINK and call +`org-transclusion-add-callback' with PAYLOAD, LINK, PLIST, COPY." + (pcase-let* ((target-mkr (point-marker)) + (url (org-element-property :raw-link link)) + ((cl-struct url filename target) (url-generic-parse-url url)) + (tc-type)) + (plz 'get url + :as 'buffer + :then + (lambda (_response-buffer) + (when-let ((target-buf (marker-buffer target-mkr))) + (cond + ((org-transclusion-http-html-p (current-buffer)) ; HTML + (let ((dom (libxml-parse-html-region))) + (when (dom-by-id dom (format "\\`%s\\'" target)) + ;; Page contains id element matching link target. + (erase-buffer) + (dom-print (org-transclusion-http--target-content dom target))) + (org-transclusion--insert-org-from-html-with-pandoc) + ;; Use "org-http" `tc-type' since HTML is converted to Org mode. + (setf tc-type "org-http"))) + ((org-transclusion-org-file-p filename) ; Org-mode + ;; FIXME: filename may contain a query string, so it may not end + ;; with "org" or "org.gpg". For example, + ;; https://example.com/foobar.org?query=answer has the filename + ;; /foobar.org?query=answer and therefore doesn't match. + (when target + (org-mode) + (let ((org-link-search-must-match-exact-headline t)) + (when (with-demoted-errors "org-transclusion-http error:\n%s\ntranscluding whole file..." + (org-link-search (format "#%s" target))) + (org-narrow-to-subtree)))) + (setf tc-type "org-http")) + (t ; All other file types + (setf tc-type "others-http"))) + (let* ((payload-without-type + (org-transclusion-content-org-buffer-or-element nil plist)) + (payload (append `(:tc-type ,tc-type) payload-without-type))) + (with-current-buffer target-buf + (org-with-wide-buffer + (goto-char (marker-position target-mkr)) + (org-transclusion-add-callback payload link plist copy))))))))) + +(defun org-transclusion-http--target-content (dom target) + "Return DOM element(s) that correspond to the TARGET. +Since anchors may refer to headings but not the text following +the heading, this function may not return the expected element." + ;; HTML link fragments (targets) point to a specific point in a document, + ;; not a range of text. This function attempts to guess what range of + ;; text a target refers to based on what HTML element is targeted. + ;; See <https://github.com/alphapapa/org-web-tools/issues/72>. + (let ((id-element (car (dom-by-id dom (format "\\`%s\\'" target))))) + (pcase (car id-element) + ((and (or 'h1 'h2 'h3 'h4 'h5 'h6) + target-heading) + ;; If the HTML element is a heading, include it and subsequent + ;; sibling elements until next heading of same level or higher. + (let* ((siblings (dom-children (dom-parent dom id-element))) + (heading-position (cl-position id-element siblings)) + (next-heading-position + (cl-position + nil siblings + :start (1+ heading-position) + :test (lambda (_a b) + (and (not (stringp b)) + (pcase (car b) + ((and (or 'h1 'h2 'h3 'h4 'h5 'h6) + subsequent-heading) + (not (string> + (symbol-name target-heading) + (symbol-name subsequent-heading)))))))))) + (append '(div ()) ; Wrap in div so all elements are rendered + (cl-subseq siblings heading-position + (when next-heading-position + (1+ next-heading-position)))))) + ('dt + ;; Include <dt> and subsequent <dd> element. + ;; TODO: Consider using next-sibling combinator with + ;; `esxml-query' once it's supported. + (let* ((siblings (dom-children (dom-parent dom id-element))) + (dt-position (cl-position id-element siblings)) + (subsequent-dd-position + (cl-position + nil siblings + :start (1+ dt-position) + :test (lambda (_a b) (and (not (stringp b)) + (eq 'dd (car b))))))) + (append '(div ()) ; Wrap in div so all elements are rendered + (cl-subseq siblings dt-position + (when subsequent-dd-position + (1+ subsequent-dd-position)))))) + ('nil ; Invalid target: Return whole dom. + dom) + (_ ; Any other valid target: Return it. + id-element)))) + +;;;;; Helpers + +(defun org-transclusion-http-html-p (buffer) + "Return non-nil if BUFFER is visiting an HTML file." + (with-current-buffer buffer + (save-excursion + (goto-char (point-min)) + ;; Assume DOCTYPE is within the first 5 lines + (search-forward "!DOCTYPE html" (pos-eol 5) t)))) + +;;;;;; Copied/Adapted from `org-web-tools' + +(defun org-transclusion--insert-org-from-html-with-pandoc (&optional buffer) + "Replace current HTML contents of BUFFER with Org with Pandoc. +When nil, BUFFER defaults to current buffer." + ;; Based on `org-web-tools--html-to-org-with-pandoc'. + (with-current-buffer (or buffer (current-buffer)) + (unless (zerop (call-process-region + (point-min) (point-max) "pandoc" t t nil + "--wrap=none" "-f" "html-raw_html-native_divs" "-t" "org")) + ;; TODO: Add error output, see org-protocol-capture-html + (error "Pandoc failed")) + (org-mode) + (org-transclusion--clean-pandoc-output))) + +(defun org-transclusion--clean-pandoc-output () + "Remove unwanted things in current buffer of Pandoc output." + (org-transclusion--remove-bad-characters) + (org-transclusion--remove-html-blocks) + (org-transclusion--remove-custom_id_properties)) + +(defun org-transclusion--remove-bad-characters () + "Remove unwanted characters from current buffer." + (save-excursion + (cl-loop for (re . replacement) in '((" " . "")) + do (progn + (goto-char (point-min)) + (while (re-search-forward re nil t) + (replace-match replacement)))))) + +(defun org-transclusion--remove-html-blocks () + "Remove \"#+BEGIN_HTML...#+END_HTML\" blocks from current buffer." + (save-excursion + (goto-char (point-min)) + (while (re-search-forward (rx (optional "\n") + "#+BEGIN_HTML" + (minimal-match (1+ anything)) + "#+END_HTML" + (optional "\n")) + nil t) + (replace-match "")))) + +(defun org-transclusion--remove-custom_id_properties () + "Remove property drawers containing CUSTOM_ID properties. +This is a blunt instrument: any drawer containing the CUSTOM_ID +property is removed, regardless of other properties it may +contain. This seems to be the best course of action in current +Pandoc output." + (let ((regexp (org-re-property "CUSTOM_ID" nil nil))) + (save-excursion + (goto-char (point-min)) + (while (re-search-forward regexp nil t) + (when (org-at-property-p) + (org-back-to-heading) + ;; As a minor optimization, we don't bound the search to the current + ;; entry. Unless the current property drawer is malformed, which + ;; shouldn't happen in Pandoc output, it should work. + (re-search-forward org-property-drawer-re) + (setf (buffer-substring (match-beginning 0) (match-end 0)) "")))))) + +;;;; Footer + +(provide 'org-transclusion-http) + +;;; org-transclusion-http.el ends here diff --git a/org-transclusion.el b/org-transclusion.el index 4a2bf27a4a..14b81290e1 100644 --- a/org-transclusion.el +++ b/org-transclusion.el @@ -23,7 +23,7 @@ ;; Keywords: org-mode, transclusion, writing ;; Version: 1.3.2 -;; Package-Requires: ((emacs "27.1") (org "9.4")) +;; Package-Requires: ((emacs "27.1") (org "9.4") (plz "0.7.2")) ;; This file is not part of GNU Emacs. @@ -68,6 +68,7 @@ Intended for :set property for `customize'." (const :tag "font-lock: Add font-lock for Org-transclusion" org-transclusion-font-lock) (const :tag "indent-mode: Support org-indent-mode" org-transclusion-indent-mode) + (const :tag "http: Transclude content over HTTP" org-transclusion-http) (repeat :tag "Other packages" :inline t (symbol :tag "Package")))) (defcustom org-transclusion-add-all-on-activate t diff --git a/test/org-transclusion-http.org b/test/org-transclusion-http.org new file mode 100644 index 0000000000..a57a2a4608 --- /dev/null +++ b/test/org-transclusion-http.org @@ -0,0 +1,17 @@ +# id="care" points to <h2>: Transclude <h2> and content after <h2> also. +#+transclude: [[https://ushin.org/needs-list.html#care]] + +# Org file with heading CUSTOM_ID: "care": Transclude only that heading. +#+transclude: [[https://ushin.org/needs-list.org#care]] + +# Nonexistent target in Org file: Transclude entire file. +#+transclude: [[https://ushin.org/needs-list.org#nonexistent-target]] + +# Nonexistent target in HTML file: Transclude entire file as Org. +#+transclude: [[https://ushin.org/needs-list.html#nonexistent-target]] + +# id="jabber" points to <section>: Transclude only <section> content. +#+transclude: [[https://jmp.chat/faq#jabber]] + +# id="autocapitalize" points to <dt>: Transclude <dt> and subsequent <dd>. +#+transclude: [[https://developer.mozilla.org/en-US/docs/Web/HTML/Element/textarea#autocapitalize]]