org-protocol-capture-html.el (12166B)
1 ;;; org-protocol-capture-html.el --- Capture HTML with org-protocol 2 3 ;; URL: https://github.com/alphapapa/org-protocol-capture-html 4 ;; Version: 0.1-pre 5 ;; Package-Requires: ((emacs "24.4")) 6 7 ;;; Commentary: 8 9 ;; This package captures Web pages into Org-mode using Pandoc to 10 ;; process HTML. It can also use eww's eww-readable functionality to 11 ;; get the main content of a page. 12 13 ;; These are the helper functions that run in Emacs. To capture pages 14 ;; into Emacs, you can use either a browser bookmarklet or the 15 ;; org-protocol-capture-html.sh shell script. See the README.org file 16 ;; for instructions. 17 18 ;;; License: 19 20 ;; This program is free software; you can redistribute it and/or modify 21 ;; it under the terms of the GNU General Public License as published by 22 ;; the Free Software Foundation, either version 3 of the License, or 23 ;; (at your option) any later version. 24 25 ;; This program is distributed in the hope that it will be useful, 26 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of 27 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 28 ;; GNU General Public License for more details. 29 30 ;; You should have received a copy of the GNU General Public License 31 ;; along with this program. If not, see <http://www.gnu.org/licenses/>. 32 33 ;;; Code: 34 35 ;;;; Require 36 37 (require 'org-protocol) 38 (require 'cl-lib) 39 (require 'subr-x) 40 (require 's) 41 42 ;;;; Vars 43 44 (defcustom org-protocol-capture-html-demote-times 1 45 "How many times to demote headings in captured pages. 46 You may want to increase this if you use a sub-heading in your capture template." 47 :group 'org-protocol-capture-html :type 'integer) 48 49 ;;;; Test Pandoc 50 51 (defconst org-protocol-capture-html-pandoc-no-wrap-option nil 52 ;; Set this so it won't be unbound 53 "Option to pass to Pandoc to disable wrapping. 54 Pandoc >= 1.16 deprecates `--no-wrap' in favor of 55 `--wrap=none'.") 56 57 (defun org-protocol-capture-html--define-pandoc-wrap-const () 58 "Set `org-protocol-capture-html-pandoc-no-wrap-option'." 59 (setq org-protocol-capture-html-pandoc-no-wrap-option 60 ;; Pandoc >= 1.16 deprecates the --no-wrap option, replacing it with 61 ;; --wrap=none. Sending the wrong option causes output to STDERR, 62 ;; which `call-process-region' doesn't like. So we test Pandoc to see 63 ;; which option to use. 64 (with-temp-buffer 65 (let* ((process (start-process "test-pandoc" (current-buffer) "pandoc" "--dump-args" "--no-wrap")) 66 (limit 3) 67 (checked 0)) 68 (while (process-live-p process) 69 (if (= checked limit) 70 (progn 71 ;; Pandoc didn't exit in time. Kill it and raise 72 ;; an error. This function will return `nil' and 73 ;; `org-protocol-capture-html-pandoc-no-wrap-option' 74 ;; will remain `nil', which will cause this 75 ;; function to run again and set the const when a 76 ;; capture is run. 77 (set-process-query-on-exit-flag process nil) 78 (error "Unable to test Pandoc! Please report this bug! (include the output of \"pandoc --dump-args --no-wrap\")")) 79 (sleep-for 0.2) 80 (cl-incf checked))) 81 (if (and (zerop (process-exit-status process)) 82 (not (string-match "--no-wrap is deprecated" (buffer-string)))) 83 "--no-wrap" 84 "--wrap=none"))))) 85 86 ;;;; Direct-to-Pandoc 87 88 (defun org-protocol-capture-html--with-pandoc (data) 89 "Process an org-protocol://capture-html:// URL using DATA. 90 91 This function is basically a copy of `org-protocol-do-capture', but 92 it passes the captured content (not the URL or title) through 93 Pandoc, converting HTML to Org-mode." 94 95 ;; It would be nice to not basically duplicate 96 ;; `org-protocol-do-capture', but passing the data back to that 97 ;; function would require re-encoding the data into a URL string 98 ;; with Emacs after Pandoc converts it. Since we've already split 99 ;; it up, we might as well go ahead and run the capture directly. 100 101 (unless org-protocol-capture-html-pandoc-no-wrap-option 102 (org-protocol-capture-html--define-pandoc-wrap-const)) 103 104 (let* ((template (or (plist-get data :template) 105 org-protocol-default-template-key)) 106 (url (org-protocol-sanitize-uri (plist-get data :url))) 107 (type (if (string-match "^\\([a-z]+\\):" url) 108 (match-string 1 url))) 109 (title (or (org-protocol-capture-html--nbsp-to-space (string-trim (plist-get data :title))) "")) 110 (content (or (org-protocol-capture-html--nbsp-to-space (string-trim (plist-get data :body))) "")) 111 (orglink (org-make-link-string 112 url (if (string-match "[^[:space:]]" title) title url))) 113 (org-capture-link-is-already-stored t)) ; avoid call to org-store-link 114 115 (setq org-stored-links 116 (cons (list url title) org-stored-links)) 117 (kill-new orglink) 118 119 (with-temp-buffer 120 (insert content) 121 (if (not (zerop (call-process-region 122 (point-min) (point-max) 123 "pandoc" t t nil "-f" "html" "-t" "org" org-protocol-capture-html-pandoc-no-wrap-option))) 124 (message "Pandoc failed: %s" (buffer-string)) 125 (progn 126 ;; Pandoc succeeded 127 (org-store-link-props :type type 128 :annotation orglink 129 :link url 130 :description title 131 :orglink orglink 132 :initial (buffer-string))))) 133 (org-protocol-capture-html--do-capture) 134 nil)) 135 136 (add-to-list 'org-protocol-protocol-alist 137 '("capture-html" 138 :protocol "capture-html" 139 :function org-protocol-capture-html--with-pandoc 140 :kill-client t)) 141 142 ;;;; eww-readable 143 144 (defvar url-http-end-of-headers) 145 146 (eval-when-compile 147 ;; eww-readable only works on Emacs >=25.1, but I think it's better 148 ;; to check for the actual symbols. I think using 149 ;; `eval-when-compile' is the right way to do this, but I'm not 150 ;; sure. 151 (when (and (require 'eww nil t) 152 (require 'dom nil t) 153 (fboundp 'eww-score-readability)) 154 155 (defun org-protocol-capture-html--capture-eww-readable (data) 156 "Capture content of URL with eww-readable.." 157 158 (unless org-protocol-capture-html-pandoc-no-wrap-option 159 (org-protocol-capture-html--define-pandoc-wrap-const)) 160 161 (let* ((template (or (plist-get data :template) 162 org-protocol-default-template-key)) 163 (url (org-protocol-sanitize-uri (plist-get data :url))) 164 (type (if (string-match "^\\([a-z]+\\):" url) 165 (match-string 1 url))) 166 (html (org-protocol-capture-html--url-html url)) 167 (result (org-protocol-capture-html--eww-readable html)) 168 (title (cdr result)) 169 (content (with-temp-buffer 170 (insert (org-protocol-capture-html--nbsp-to-space (car result))) 171 ;; Convert to Org with Pandoc 172 (unless (= 0 (call-process-region (point-min) (point-max) 173 "pandoc" t t nil "-f" "html" "-t" "org" 174 org-protocol-capture-html-pandoc-no-wrap-option)) 175 (error "Pandoc failed")) 176 (save-excursion 177 ;; Remove DOS CR/LF line endings 178 (goto-char (point-min)) 179 (while (search-forward (string ?\C-m) nil t) 180 (replace-match ""))) 181 ;; Demote page headings in capture buffer to below the 182 ;; top-level Org heading and "Article" 2nd-level heading 183 (save-excursion 184 (goto-char (point-min)) 185 (while (re-search-forward (rx bol (1+ "*") (1+ space)) nil t) 186 (beginning-of-line) 187 (insert "**") 188 (end-of-line))) 189 (buffer-string))) 190 (orglink (org-make-link-string 191 url (if (s-present? title) title url))) 192 ;; Avoid call to org-store-link 193 (org-capture-link-is-already-stored t)) 194 195 (setq org-stored-links 196 (cons (list url title) org-stored-links)) 197 (kill-new orglink) 198 199 (org-store-link-props :type type 200 :annotation orglink 201 :link url 202 :description title 203 :orglink orglink 204 :initial content) 205 (org-protocol-capture-html--do-capture) 206 nil)) 207 208 (add-to-list 'org-protocol-protocol-alist 209 '("capture-eww-readable" 210 :protocol "capture-eww-readable" 211 :function org-protocol-capture-html--capture-eww-readable 212 :kill-client t)) 213 214 (defun org-protocol-capture-html--url-html (url) 215 "Return HTML from URL as string." 216 (let* ((response-buffer (url-retrieve-synchronously url nil t)) 217 (encoded-html (with-current-buffer response-buffer 218 (pop-to-buffer response-buffer) 219 ;; Skip HTTP headers, using marker provided by url-http 220 (delete-region (point-min) (1+ url-http-end-of-headers)) 221 (buffer-string)))) 222 (kill-buffer response-buffer) ; Not sure if necessary to avoid leaking buffer 223 (with-temp-buffer 224 ;; For some reason, running `decode-coding-region' in the 225 ;; response buffer has no effect, so we have to do it in a 226 ;; temp buffer. 227 (insert encoded-html) 228 (condition-case nil 229 ;; Fix undecoded text 230 (decode-coding-region (point-min) (point-max) 'utf-8) 231 (coding-system-error nil)) 232 (buffer-string)))) 233 234 (defun org-protocol-capture-html--eww-readable (html) 235 "Return `eww-readable' part of HTML with title. 236 Returns list (HTML . TITLE)." 237 ;; Based on `eww-readable' 238 (let* ((html 239 ;; Convert " " in HTML to plain spaces. 240 ;; `libxml-parse-html-region' turns them into 241 ;; underlines. The closest I can find to an explanation 242 ;; is at <http://www.perlmonks.org/?node_id=825188>. 243 (org-protocol-capture-html--nbsp-to-space html)) 244 (dom (with-temp-buffer 245 (insert html) 246 (libxml-parse-html-region (point-min) (point-max)))) 247 (title (cl-caddr (car (dom-by-tag dom 'title))))) 248 (eww-score-readability dom) 249 (cons (with-temp-buffer 250 (shr-dom-print (eww-highest-readability dom)) 251 (buffer-string)) 252 title))))) 253 254 ;;;; Helper functions 255 256 (defun org-protocol-capture-html--nbsp-to-space (s) 257 "Convert HTML non-breaking spaces to plain spaces in S." 258 ;; Not sure why sometimes these are in the HTML and Pandoc converts 259 ;; them to underlines instead of spaces, but this fixes it. 260 (replace-regexp-in-string (rx " ") " " s t t)) 261 262 (with-no-warnings 263 ;; Ignore warning about the dynamically scoped `template' variable. 264 (defun org-protocol-capture-html--do-capture () 265 "Call `org-capture' and demote page headings in capture buffer." 266 (raise-frame) 267 (funcall 'org-capture nil template) 268 269 ;; Demote page headings in capture buffer to below the 270 ;; top-level Org heading 271 (save-excursion 272 (goto-char (point-min)) 273 (re-search-forward (rx bol "*" (1+ space)) nil t) ; Skip 1st heading 274 (while (re-search-forward (rx bol "*" (1+ space)) nil t) 275 (dotimes (n org-protocol-capture-html-demote-times) 276 (org-demote-subtree)))))) 277 278 (provide 'org-protocol-capture-html) 279 280 ;;; org-protocol-capture-html.el ends here