branch: externals/doc-toc commit 05b6d034aad8ccb7fe3ca9fdcfdbf543b1cd0277 Author: Daniel Nicolai <dalanico...@gmail.com> Commit: Daniel Nicolai <dalanico...@gmail.com>
Fix djvu/pdf hard dependency (github issue #3) --- README.org | 31 ++++++++++++++++++------------- toc-mode.el | 30 ++++++++++++++++++++++-------- 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/README.org b/README.org index aea75eaac6..c2e347e7dd 100644 --- a/README.org +++ b/README.org @@ -38,6 +38,11 @@ Extraction and adding contents to a document is done in 4 steps: 3. adjust/correct pagenumbers 4. add TOC to document +In each step below, check out available shortcuts using =C-h m=. Additionally you +can find available functions by typing the =M-x mode-name= (e.g. =M-x toc-cleanup=), +or with two dashes in the mode name (e.g. =M-x toc--cleanup=). Of course if you +use packages like Ivy or Helm you just use the fuzzy search functionality. + ** 1. Extraction Open some pdf or djvu file in Emacs (pdf-tools and djvu package recommended). Find the pagenumbers for the TOC. Then type =M-x toc-extract-pages=, or =M-x @@ -51,12 +56,12 @@ data). Also the languages used for tesseract OCR can be customized via the [[toc-mode-extract.gif]] A buffer with the, somewhat cleaned up, extracted text will open in TOC-cleanup -mode. Prefix command with the universal argument (=C-u=) to omit clean and get the -raw text. If the extracted text is of too low quality you either can hack/extend -the [[help:toc-extract-pages-ocr][toc-extract-pages-ocr]] definition, or alternatively you can try to extract -the text with the [[https://pypi.org/project/document-contents-extractor/][python document-contents-extractor script]], which is more -configurable (you are also welcome to hack on and improve that script). -For this the [[https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html][tesseract]] documentation might be useful. +mode. Prefix command with the universal argument (=C-u=) to omit cleanup and get +the raw text. 
If the extracted text is of too low quality you either can +hack/extend the [[help:toc-extract-pages-ocr][toc-extract-pages-ocr]] definition, or alternatively you can try +to extract the text with the [[https://pypi.org/project/document-contents-extractor/][python document-contents-extractor script]], which is +more configurable (you are also welcome to hack on and improve that script). For +this the [[https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html][tesseract]] documentation might be useful. If you merely want to extract text without further processing then you can use the command [[help:toc-extract-only][toc-extract-only]]. @@ -126,13 +131,13 @@ directory or an absolute path can be given.) Sometimes the =pdfoutline/djvused= application is not able to add the TOC to the document. In that case you can either debug the problem by copying the used terminal command from the =*messages*= buffer and run it manually in the -document's folder, or you can delete the outline source buffer and run -=toc--tablist-to-handyoutliner= from the tablist buffer to get an outline source -file that can be used with [[http://handyoutlinerfo.sourceforge.net/][HandyOutliner]] (unfortunately the handyoutliner -command does not take arguments, but if you customize the [[help:toc-handyoutliner-path][toc-handyoutliner-path]] -and [[help:toc-file-browser-command][toc-file-browser-command]] variables, then Emacs will try to open -HandyOutliner and the file browser so that you can drag the file ~contents.txt~ -directly into HandyOutliner). 
+document's folder inside the terminal, or you can delete the outline source +buffer and run =toc--tablist-to-handyoutliner= from the tablist buffer to get an +outline source file that can be used with [[http://handyoutlinerfo.sourceforge.net/][HandyOutliner]] (unfortunately the +handyoutliner command does not take arguments, but if you customize the +[[help:toc-handyoutliner-path][toc-handyoutliner-path]] and [[help:toc-file-browser-command][toc-file-browser-command]] variables, then Emacs will +try to open HandyOutliner and the file browser so that you can drag the file +~contents.txt~ directly into HandyOutliner). diff --git a/toc-mode.el b/toc-mode.el index 43f884caf5..e514172daf 100644 --- a/toc-mode.el +++ b/toc-mode.el @@ -39,8 +39,19 @@ ;; Extraction with OCR requires the tesseract command line utility to be ;; available. -;; Usage: Extraction and adding contents to a document is done in 4 steps: 1 -;; extraction 2 cleanup 3 adjust/correct pagenumbers 4 add TOC to document +;; Usage: + +;; In each step below, check out available shortcuts using C-h m. Additionally +;; you can find available functions by typing the M-x mode-name (e.g. M-x +;; toc-cleanup), or with two dashes in the mode name (e.g. M-x toc--cleanup). Of +;; course if you use packages like Ivy or Helm you just use the fuzzy search +;; functionality. + +;; Extraction and adding contents to a document is done in 4 steps: +;; 1 extraction +;; 2 cleanup +;; 3 adjust/correct pagenumbers +;; 4 add TOC to document ;; 1. Extraction Open some pdf or djvu file in Emacs (pdf-tools and djvu package ;; recommended). Find the pagenumbers for the TOC. Then type M-x @@ -91,7 +102,7 @@ ;; automatically to the next line not ending with a number and joins it with the ;; next line. If the indentation structure of the different lines does not ;; correspond with the levels, then the levels can be set automatically from the -;; number of separatorss in the indices with M-x toc--cleanup-set-level-by-index. 
+;; number of separatorss in the indices with M-x toc-cleanup-set-level-by-index. ;; The default separators is a . but a different separators can be entered by ;; preceding the function invocation with the universal argument (C-u). Some ;; documents contain a structure like @@ -321,7 +332,7 @@ When ARG is non-nil it skips the last three steps" (string-list (split-string index sep t))) (length string-list))) -(defun toc--cleanup-set-level-by-index (&optional arg) +(defun toc-cleanup-set-level-by-index (&optional arg) "Automatic set indentation by number of separatorss in index. By default uses dots as separators. Prepend with universal ARG (\\[universal-argument]) to enter different separators." @@ -494,10 +505,13 @@ Prompt for startpage and endpage and print OCR output to new buffer." ;;;; toc major modes -(define-key pdf-view-mode-map (kbd "C-c C-e") 'toc-extract-pages) -(define-key djvu-read-mode-map (kbd "C-c C-e") 'toc-extract-pages) -(define-key pdf-view-mode-map (kbd "C-c e") 'toc-extract-pages-ocr) -(define-key djvu-read-mode-map (kbd "C-c e") 'toc-extract-pages-ocr) +(when (require 'pdf-tools nil t) + (define-key pdf-view-mode-map (kbd "C-c C-e") 'toc-extract-pages) + (define-key pdf-view-mode-map (kbd "C-c e") 'toc-extract-pages-ocr)) + +(when (require 'djvu nil t) + (define-key djvu-read-mode-map (kbd "C-c C-e") 'toc-extract-pages) + (define-key djvu-read-mode-map (kbd "C-c e") 'toc-extract-pages-ocr)) (defvar toc-cleanup-mode-map (let ((map (make-sparse-keymap)))