home

My NixOS systems configurations.
Log | Files | Refs | LICENSE

org-protocol-capture-html.el (12166B)


      1 ;;; org-protocol-capture-html.el --- Capture HTML with org-protocol
      2 
      3 ;; URL: https://github.com/alphapapa/org-protocol-capture-html
      4 ;; Version: 0.1-pre
      5 ;; Package-Requires: ((emacs "24.4"))
      6 
      7 ;;; Commentary:
      8 
      9 ;; This package captures Web pages into Org-mode using Pandoc to
     10 ;; process HTML.  It can also use eww's eww-readable functionality to
     11 ;; get the main content of a page.
     12 
     13 ;; These are the helper functions that run in Emacs.  To capture pages
     14 ;; into Emacs, you can use either a browser bookmarklet or the
     15 ;; org-protocol-capture-html.sh shell script.  See the README.org file
     16 ;; for instructions.
     17 
     18 ;;; License:
     19 
     20 ;; This program is free software; you can redistribute it and/or modify
     21 ;; it under the terms of the GNU General Public License as published by
     22 ;; the Free Software Foundation, either version 3 of the License, or
     23 ;; (at your option) any later version.
     24 
     25 ;; This program is distributed in the hope that it will be useful,
     26 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
     27 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     28 ;; GNU General Public License for more details.
     29 
     30 ;; You should have received a copy of the GNU General Public License
     31 ;; along with this program.  If not, see <http://www.gnu.org/licenses/>.
     32 
     33 ;;; Code:
     34 
     35 ;;;; Require
     36 
     37 (require 'org-protocol)
     38 (require 'cl-lib)
     39 (require 'subr-x)
     40 (require 's)
     41 
     42 ;;;; Vars
     43 
     44 (defcustom org-protocol-capture-html-demote-times 1
     45   "How many times to demote headings in captured pages.
     46 You may want to increase this if you use a sub-heading in your capture template."
     47   :group 'org-protocol-capture-html :type 'integer)
     48 
     49 ;;;; Test Pandoc
     50 
     51 (defconst org-protocol-capture-html-pandoc-no-wrap-option nil
     52   ;; Set this so it won't be unbound
     53   "Option to pass to Pandoc to disable wrapping.
     54 Pandoc >= 1.16 deprecates `--no-wrap' in favor of
     55 `--wrap=none'.")
     56 
     57 (defun org-protocol-capture-html--define-pandoc-wrap-const ()
     58   "Set `org-protocol-capture-html-pandoc-no-wrap-option'."
     59   (setq org-protocol-capture-html-pandoc-no-wrap-option
     60         ;; Pandoc >= 1.16 deprecates the --no-wrap option, replacing it with
     61         ;; --wrap=none.  Sending the wrong option causes output to STDERR,
     62         ;; which `call-process-region' doesn't like.  So we test Pandoc to see
     63         ;; which option to use.
     64         (with-temp-buffer
     65           (let* ((process (start-process "test-pandoc" (current-buffer) "pandoc" "--dump-args" "--no-wrap"))
     66                  (limit 3)
     67                  (checked 0))
     68             (while (process-live-p process)
     69               (if (= checked limit)
     70                   (progn
     71                     ;; Pandoc didn't exit in time.  Kill it and raise
     72                     ;; an error.  This function will return `nil' and
     73                     ;; `org-protocol-capture-html-pandoc-no-wrap-option'
     74                     ;; will remain `nil', which will cause this
     75                     ;; function to run again and set the const when a
     76                     ;; capture is run.
     77                     (set-process-query-on-exit-flag process nil)
     78                     (error "Unable to test Pandoc!  Please report this bug! (include the output of \"pandoc --dump-args --no-wrap\")"))
     79                 (sleep-for 0.2)
     80                 (cl-incf checked)))
     81             (if (and (zerop (process-exit-status process))
     82                      (not (string-match "--no-wrap is deprecated" (buffer-string))))
     83                 "--no-wrap"
     84               "--wrap=none")))))
     85 
     86 ;;;; Direct-to-Pandoc
     87 
     88 (defun org-protocol-capture-html--with-pandoc (data)
     89   "Process an org-protocol://capture-html:// URL using DATA.
     90 
     91 This function is basically a copy of `org-protocol-do-capture', but
     92 it passes the captured content (not the URL or title) through
     93 Pandoc, converting HTML to Org-mode."
     94 
     95   ;; It would be nice to not basically duplicate
     96   ;; `org-protocol-do-capture', but passing the data back to that
     97   ;; function would require re-encoding the data into a URL string
     98   ;; with Emacs after Pandoc converts it.  Since we've already split
     99   ;; it up, we might as well go ahead and run the capture directly.
    100 
    101   (unless org-protocol-capture-html-pandoc-no-wrap-option
    102     (org-protocol-capture-html--define-pandoc-wrap-const))
    103 
    104   (let* ((template (or (plist-get data :template)
    105                        org-protocol-default-template-key))
    106          (url (org-protocol-sanitize-uri (plist-get data :url)))
    107          (type (if (string-match "^\\([a-z]+\\):" url)
    108                    (match-string 1 url)))
    109          (title (or (org-protocol-capture-html--nbsp-to-space (string-trim (plist-get data :title))) ""))
    110          (content (or (org-protocol-capture-html--nbsp-to-space (string-trim (plist-get data :body))) ""))
    111          (orglink (org-make-link-string
    112                    url (if (string-match "[^[:space:]]" title) title url)))
    113          (org-capture-link-is-already-stored t)) ; avoid call to org-store-link
    114 
    115     (setq org-stored-links
    116           (cons (list url title) org-stored-links))
    117     (kill-new orglink)
    118 
    119     (with-temp-buffer
    120       (insert content)
    121       (if (not (zerop (call-process-region
    122                        (point-min) (point-max)
    123                        "pandoc" t t nil "-f" "html" "-t" "org" org-protocol-capture-html-pandoc-no-wrap-option)))
    124           (message "Pandoc failed: %s" (buffer-string))
    125         (progn
    126           ;; Pandoc succeeded
    127           (org-store-link-props :type type
    128                                 :annotation orglink
    129                                 :link url
    130                                 :description title
    131                                 :orglink orglink
    132                                 :initial (buffer-string)))))
    133     (org-protocol-capture-html--do-capture)
    134     nil))
    135 
    136 (add-to-list 'org-protocol-protocol-alist
    137              '("capture-html"
    138                :protocol "capture-html"
    139                :function org-protocol-capture-html--with-pandoc
    140                :kill-client t))
    141 
    142 ;;;; eww-readable
    143 
    144 (defvar url-http-end-of-headers)
    145 
    146 (eval-when-compile
    147   ;; eww-readable only works on Emacs >=25.1, but I think it's better
    148   ;; to check for the actual symbols.  I think using
    149   ;; `eval-when-compile' is the right way to do this, but I'm not
    150   ;; sure.
    151   (when (and (require 'eww nil t)
    152              (require 'dom nil t)
    153              (fboundp 'eww-score-readability))
    154 
    155     (defun org-protocol-capture-html--capture-eww-readable (data)
    156       "Capture content of URL with eww-readable.."
    157 
    158       (unless org-protocol-capture-html-pandoc-no-wrap-option
    159         (org-protocol-capture-html--define-pandoc-wrap-const))
    160 
    161       (let* ((template (or (plist-get data :template)
    162                            org-protocol-default-template-key))
    163              (url (org-protocol-sanitize-uri (plist-get data :url)))
    164              (type (if (string-match "^\\([a-z]+\\):" url)
    165                        (match-string 1 url)))
    166              (html (org-protocol-capture-html--url-html url))
    167              (result (org-protocol-capture-html--eww-readable html))
    168              (title (cdr result))
    169              (content (with-temp-buffer
    170                         (insert (org-protocol-capture-html--nbsp-to-space (car result)))
    171                         ;; Convert to Org with Pandoc
    172                         (unless (= 0 (call-process-region (point-min) (point-max)
    173                                                           "pandoc" t t nil "-f" "html" "-t" "org"
    174                                                           org-protocol-capture-html-pandoc-no-wrap-option))
    175                           (error "Pandoc failed"))
    176                         (save-excursion
    177                           ;; Remove DOS CR/LF line endings
    178                           (goto-char (point-min))
    179                           (while (search-forward (string ?\C-m) nil t)
    180                             (replace-match "")))
    181                         ;; Demote page headings in capture buffer to below the
    182                         ;; top-level Org heading and "Article" 2nd-level heading
    183                         (save-excursion
    184                           (goto-char (point-min))
    185                           (while (re-search-forward (rx bol (1+ "*") (1+ space)) nil t)
    186                             (beginning-of-line)
    187                             (insert "**")
    188                             (end-of-line)))
    189                         (buffer-string)))
    190              (orglink (org-make-link-string
    191                        url (if (s-present? title) title url)))
    192              ;; Avoid call to org-store-link
    193              (org-capture-link-is-already-stored t))
    194 
    195         (setq org-stored-links
    196               (cons (list url title) org-stored-links))
    197         (kill-new orglink)
    198 
    199         (org-store-link-props :type type
    200                               :annotation orglink
    201                               :link url
    202                               :description title
    203                               :orglink orglink
    204                               :initial content)
    205         (org-protocol-capture-html--do-capture)
    206         nil))
    207 
    208     (add-to-list 'org-protocol-protocol-alist
    209                  '("capture-eww-readable"
    210                    :protocol "capture-eww-readable"
    211                    :function org-protocol-capture-html--capture-eww-readable
    212                    :kill-client t))
    213 
    214     (defun org-protocol-capture-html--url-html (url)
    215       "Return HTML from URL as string."
    216       (let* ((response-buffer (url-retrieve-synchronously url nil t))
    217              (encoded-html (with-current-buffer response-buffer
    218                              (pop-to-buffer response-buffer)
    219                              ;; Skip HTTP headers, using marker provided by url-http
    220                              (delete-region (point-min) (1+ url-http-end-of-headers))
    221                              (buffer-string))))
    222         (kill-buffer response-buffer)     ; Not sure if necessary to avoid leaking buffer
    223         (with-temp-buffer
    224           ;; For some reason, running `decode-coding-region' in the
    225           ;; response buffer has no effect, so we have to do it in a
    226           ;; temp buffer.
    227           (insert encoded-html)
    228           (condition-case nil
    229               ;; Fix undecoded text
    230               (decode-coding-region (point-min) (point-max) 'utf-8)
    231             (coding-system-error nil))
    232           (buffer-string))))
    233 
    234     (defun org-protocol-capture-html--eww-readable (html)
    235       "Return `eww-readable' part of HTML with title.
    236 Returns list (HTML . TITLE)."
    237       ;; Based on `eww-readable'
    238       (let* ((html
    239               ;; Convert "&nbsp;" in HTML to plain spaces.
    240               ;; `libxml-parse-html-region' turns them into
    241               ;; underlines.  The closest I can find to an explanation
    242               ;; is at <http://www.perlmonks.org/?node_id=825188>.
    243               (org-protocol-capture-html--nbsp-to-space html))
    244              (dom (with-temp-buffer
    245                     (insert html)
    246                     (libxml-parse-html-region (point-min) (point-max))))
    247              (title (cl-caddr (car (dom-by-tag dom 'title)))))
    248         (eww-score-readability dom)
    249         (cons (with-temp-buffer
    250                 (shr-dom-print (eww-highest-readability dom))
    251                 (buffer-string))
    252               title)))))
    253 
    254 ;;;; Helper functions
    255 
    256 (defun org-protocol-capture-html--nbsp-to-space (s)
    257   "Convert HTML non-breaking spaces to plain spaces in S."
    258   ;; Not sure why sometimes these are in the HTML and Pandoc converts
    259   ;; them to underlines instead of spaces, but this fixes it.
    260   (replace-regexp-in-string (rx "&nbsp;") " " s t t))
    261 
    262 (with-no-warnings
    263   ;; Ignore warning about the dynamically scoped `template' variable.
    264   (defun org-protocol-capture-html--do-capture ()
    265     "Call `org-capture' and demote page headings in capture buffer."
    266     (raise-frame)
    267     (funcall 'org-capture nil template)
    268 
    269     ;; Demote page headings in capture buffer to below the
    270     ;; top-level Org heading
    271     (save-excursion
    272       (goto-char (point-min))
    273       (re-search-forward (rx bol "*" (1+ space)) nil t) ; Skip 1st heading
    274       (while (re-search-forward (rx bol "*" (1+ space)) nil t)
    275         (dotimes (n org-protocol-capture-html-demote-times)
    276           (org-demote-subtree))))))
    277 
    278 (provide 'org-protocol-capture-html)
    279 
    280 ;;; org-protocol-capture-html.el ends here