branch: elpa/subed commit 2c1020ffb5b859a67040a3a49f83396ddc321d4d Author: Sacha Chua <sa...@sachachua.com> Commit: Sacha Chua <sa...@sachachua.com>
Load word data from YouTube VTTs * subed/subed-word-data.el (subed-word-data--extract-words-from-youtube-vtt): New. (subed-word-data--load): Update documentation. (subed-word-data-load-from-file): Load from YouTube VTTs as well. (subed-word-data-load-from-string): Load from YouTube VTTs as well. (subed-word-data-load-maybe): Check if data was loaded. --- subed/subed-word-data.el | 79 +++++++++++++++++++++++++++++++++++++----------- subed/subed.el | 2 +- 2 files changed, 63 insertions(+), 18 deletions(-) diff --git a/subed/subed-word-data.el b/subed/subed-word-data.el index 53f29f8c63..2504ce8343 100644 --- a/subed/subed-word-data.el +++ b/subed/subed-word-data.el @@ -78,8 +78,47 @@ Return a list of ((start . ?), (end . ?) (text . ?))." rec)) text-elements)))) +(defun subed-word-data--extract-words-from-youtube-vtt (file &optional from-string) + "Extract the timing from FILE which is a VTT from YouTube. +Return a list of ((start . ?), (end . ?) (text . ?)). +If FROM-STRING is non-nil, treat FILE as the data itself." + (with-temp-buffer + (subed-vtt-mode) + (if from-string + (insert file) + (insert-file-contents file)) + (let ((list (subed-subtitle-list)) + results + s + start + stop + i) + (dolist (sub list) + (when (string-match "<c>" (elt sub 3)) + (setq s (elt sub 3)) + (setq i 0) + (setq start (elt sub 1)) + (while (and (< i (length s)) + (string-match "\\(.+?\\)<\\([0-9]+:[0-9]+:[0-9]+\\.[0-9]+\\)>" s i)) + (setq stop (1- (save-match-data (subed-timestamp-to-msecs (match-string 2 s))))) + (push `((text . ,(save-match-data + (string-trim (replace-regexp-in-string "</?c>" "" (match-string 1 s))))) + (start . ,start) + (end . ,stop)) + results) + (setq i (match-end 0) + start (1+ stop))) + (if (and (< i (length s)) + (not (string= "" (string-trim (substring s i))))) + (push `((text . ,(string-trim + (save-match-data (replace-regexp-in-string "</?c>" "" (substring s i))))) + (start . ,start) + (end . 
,(elt sub 2))) + results)))) + (nreverse results)))) + (defun subed-word-data--extract-words-from-whisperx-json (file &optional from-string) - "Extract the timing from file in WhisperX's JSON format. + "Extract the timing from FILE in WhisperX's JSON format. Return a list of ((start . ?), (end . ?) (text . ?) (score . ?)). If FROM-STRING is non-nil, treat FILE as the data itself." (let* ((json-object-type 'alist) @@ -113,16 +152,18 @@ If FROM-STRING is non-nil, treat FILE as the data itself." (defun subed-word-data--load (data) "Load word-level timing from DATA. -For now, only SRV2 files are supported." - (setq-local subed-word-data--cache data) - (add-hook 'subed-split-subtitle-timestamp-functions #'subed-word-data-split-at-word-timestamp -5 t) - (subed-word-data-refresh-text-properties)) +Supports WhisperX JSON, YouTube VTT, and Youtube SRV2 files." + (when data + (setq-local subed-word-data--cache data) + (add-hook 'subed-split-subtitle-timestamp-functions #'subed-word-data-split-at-word-timestamp -5 t) + (subed-word-data-refresh-text-properties) + data)) ;;;###autoload (defun subed-word-data-load-from-file (file) "Load word-level timing from FILE. -For now, only SRV2 and JSON files are supported." - (interactive (list (read-file-name "JSON or srv2: " +Supports WhisperX JSON, YouTube VTT, and Youtube SRV2 files." + (interactive (list (read-file-name "JSON, VTT, or srv2: " nil nil nil @@ -130,21 +171,26 @@ For now, only SRV2 and JSON files are supported." 
(lambda (f) (or (file-directory-p f) (string-match - "\\.json\\'\\|\\.srv2\\'" + "\\.\\(json\\|srv2\\|vtt\\)\\'" f)))))) (subed-word-data--load - (if (and (stringp file) (string-match "\\.json\\'" file)) - (subed-word-data--extract-words-from-whisperx-json file) - (subed-word-data--extract-words-from-srv2 (xml-parse-file file))))) + (pcase (file-name-extension file) + ("json" (subed-word-data--extract-words-from-whisperx-json file)) + ("srv2" (subed-word-data--extract-words-from-srv2 (xml-parse-file file))) + ("vtt" (subed-word-data--extract-words-from-youtube-vtt file))))) (defun subed-word-data-load-from-string (string) "Load word-level timing from STRING. For now, only JSON or SRV2 files are supported." - (subed-word-data--load (if (string-match "^{" string) - (subed-word-data--extract-words-from-whisperx-json string t) - (subed-word-data--extract-words-from-srv2 string)))) + (subed-word-data--load (cond + ((string-match "^{" string) + (subed-word-data--extract-words-from-whisperx-json string t)) + ((string-match "^WEBVTT" string) + (subed-word-data--extract-words-from-youtube-vtt string t)) + (t + (subed-word-data--extract-words-from-srv2 string))))) -(defvar subed-word-data-extensions '(".en.srv2" ".srv2") "Extensions to search for word data.") +(defvar subed-word-data-extensions '(".en.srv2" ".srv2" ".json" ".vtt") "Extensions to search for word data.") ;;;###autoload (defun subed-word-data-load-maybe () @@ -157,8 +203,7 @@ For now, only JSON or SRV2 files are supported." 
(setq file (concat (file-name-sans-extension (buffer-file-name)) ext)) (throw 'found))) subed-word-data-extensions)) - (when file - (subed-word-data-load-from-file file) + (when (and file (subed-word-data-load-from-file file)) (message "Word data loaded."))))) (defvar subed-word-data-normalizing-functions '(subed-word-data-normalize-word-default) diff --git a/subed/subed.el b/subed/subed.el index e4228f18be..5e3f048fdd 100644 --- a/subed/subed.el +++ b/subed/subed.el @@ -1,6 +1,6 @@ ;;; subed.el --- A major mode for editing subtitles -*- lexical-binding: t; -*- -;; Version: 1.2.23 +;; Version: 1.2.24 ;; Maintainer: Sacha Chua <sa...@sachachua.com> ;; Author: Random User ;; Keywords: convenience, files, hypermedia, multimedia