271 lines
12 KiB
EmacsLisp
271 lines
12 KiB
EmacsLisp
|
;;; org-protocol-capture-html.el --- Capture HTML with org-protocol
|
||
|
|
||
|
;;; Commentary:
|
||
|
|
||
|
;; This package captures Web pages into Org-mode using Pandoc to
|
||
|
;; process HTML. It can also use eww's eww-readable functionality to
|
||
|
;; get the main content of a page.
|
||
|
|
||
|
;; These are the helper functions that run in Emacs. To capture pages
|
||
|
;; into Emacs, you can use either a browser bookmarklet or the
|
||
|
;; org-protocol-capture-html.sh shell script. See the README.org file
|
||
|
;; for instructions.
|
||
|
|
||
|
;;; License:
|
||
|
|
||
|
;; This program is free software; you can redistribute it and/or modify
|
||
|
;; it under the terms of the GNU General Public License as published by
|
||
|
;; the Free Software Foundation, either version 3 of the License, or
|
||
|
;; (at your option) any later version.
|
||
|
|
||
|
;; This program is distributed in the hope that it will be useful,
|
||
|
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
;; GNU General Public License for more details.
|
||
|
|
||
|
;; You should have received a copy of the GNU General Public License
|
||
|
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||
|
|
||
|
;;; Code:
|
||
|
|
||
|
;;;; Require
|
||
|
|
||
|
(require 'org-protocol)
|
||
|
(require 'cl)
|
||
|
(require 'subr-x)
|
||
|
(require 's)
|
||
|
|
||
|
;;;; Vars
|
||
|
|
||
|
(defcustom org-protocol-capture-html-demote-times 1
|
||
|
"How many times to demote headings in captured pages.
|
||
|
You may want to increase this if you use a sub-heading in your capture template."
|
||
|
:group 'org-protocol-capture-html :type 'integer)
|
||
|
|
||
|
;;;; Test Pandoc
|
||
|
|
||
|
(defconst org-protocol-capture-html-pandoc-no-wrap-option nil
|
||
|
;; Set this so it won't be unbound
|
||
|
"Option to pass to Pandoc to disable wrapping. Pandoc >= 1.16
|
||
|
deprecates `--no-wrap' in favor of `--wrap=none'.")
|
||
|
|
||
|
(defun org-protocol-capture-html--define-pandoc-wrap-const ()
|
||
|
"Set `org-protocol-capture-html-pandoc-no-wrap-option'."
|
||
|
(setq org-protocol-capture-html-pandoc-no-wrap-option
|
||
|
;; Pandoc >= 1.16 deprecates the --no-wrap option, replacing it with
|
||
|
;; --wrap=none. Sending the wrong option causes output to STDERR,
|
||
|
;; which `call-process-region' doesn't like. So we test Pandoc to see
|
||
|
;; which option to use.
|
||
|
(with-temp-buffer
|
||
|
(let* ((process (start-process "test-pandoc" (current-buffer) "pandoc" "--dump-args" "--no-wrap"))
|
||
|
(limit 3)
|
||
|
(checked 0))
|
||
|
(while (process-live-p process)
|
||
|
(if (= checked limit)
|
||
|
(progn
|
||
|
;; Pandoc didn't exit in time. Kill it and raise
|
||
|
;; an error. This function will return `nil' and
|
||
|
;; `org-protocol-capture-html-pandoc-no-wrap-option'
|
||
|
;; will remain `nil', which will cause this
|
||
|
;; function to run again and set the const when a
|
||
|
;; capture is run.
|
||
|
(set-process-query-on-exit-flag process nil)
|
||
|
(error "Unable to test Pandoc! Please report this bug! (include the output of \"pandoc --dump-args --no-wrap\")"))
|
||
|
(sleep-for 0.2)
|
||
|
(cl-incf checked)))
|
||
|
(if (and (zerop (process-exit-status process))
|
||
|
(not (string-match "--no-wrap is deprecated" (buffer-string))))
|
||
|
"--no-wrap"
|
||
|
"--wrap=none")))))
|
||
|
|
||
|
;;;; Direct-to-Pandoc
|
||
|
|
||
|
(defun org-protocol-capture-html--with-pandoc (data)
|
||
|
"Process an org-protocol://capture-html:// URL.
|
||
|
This function is basically a copy of `org-protocol-do-capture', but
|
||
|
it passes the captured content (not the URL or title) through
|
||
|
Pandoc, converting HTML to Org-mode."
|
||
|
|
||
|
;; It would be nice to not basically duplicate
|
||
|
;; `org-protocol-do-capture', but passing the data back to that
|
||
|
;; function would require re-encoding the data into a URL string
|
||
|
;; with Emacs after Pandoc converts it. Since we've already split
|
||
|
;; it up, we might as well go ahead and run the capture directly.
|
||
|
|
||
|
(unless org-protocol-capture-html-pandoc-no-wrap-option
|
||
|
(org-protocol-capture-html--define-pandoc-wrap-const))
|
||
|
|
||
|
(let* ((template (or (plist-get data :template)
|
||
|
org-protocol-default-template-key))
|
||
|
(url (org-protocol-sanitize-uri (plist-get data :url)))
|
||
|
(type (if (string-match "^\\([a-z]+\\):" url)
|
||
|
(match-string 1 url)))
|
||
|
(title (or (org-protocol-capture-html--nbsp-to-space (string-trim (plist-get data :title))) ""))
|
||
|
(content (or (org-protocol-capture-html--nbsp-to-space (string-trim (plist-get data :body))) ""))
|
||
|
(orglink (org-make-link-string
|
||
|
url (if (string-match "[^[:space:]]" title) title url)))
|
||
|
(org-capture-link-is-already-stored t)) ; avoid call to org-store-link
|
||
|
|
||
|
(setq org-stored-links
|
||
|
(cons (list url title) org-stored-links))
|
||
|
(kill-new orglink)
|
||
|
|
||
|
(with-temp-buffer
|
||
|
(insert content)
|
||
|
(if (not (zerop (call-process-region
|
||
|
(point-min) (point-max)
|
||
|
"pandoc" t t nil "-f" "html" "-t" "org" org-protocol-capture-html-pandoc-no-wrap-option)))
|
||
|
(message "Pandoc failed: %s" (buffer-string))
|
||
|
(progn
|
||
|
;; Pandoc succeeded
|
||
|
(org-store-link-props :type type
|
||
|
:annotation orglink
|
||
|
:link url
|
||
|
:description title
|
||
|
:orglink orglink
|
||
|
:initial (buffer-string)))))
|
||
|
(org-protocol-capture-html--do-capture)
|
||
|
nil))
|
||
|
|
||
|
(add-to-list 'org-protocol-protocol-alist
|
||
|
'("capture-html"
|
||
|
:protocol "capture-html"
|
||
|
:function org-protocol-capture-html--with-pandoc
|
||
|
:kill-client t))
|
||
|
|
||
|
;;;; eww-readable
|
||
|
|
||
|
(eval-when-compile
|
||
|
;; eww-readable only works on Emacs >=25.1, but I think it's better
|
||
|
;; to check for the actual symbols. I think using
|
||
|
;; `eval-when-compile' is the right way to do this, but I'm not
|
||
|
;; sure.
|
||
|
(when (and (require 'eww nil t)
|
||
|
(require 'dom nil t)
|
||
|
(fboundp 'eww-score-readability))
|
||
|
|
||
|
(defun org-protocol-capture-html--capture-eww-readable (data)
|
||
|
"Capture content of URL with eww-readable.."
|
||
|
|
||
|
(unless org-protocol-capture-html-pandoc-no-wrap-option
|
||
|
(org-protocol-capture-html--define-pandoc-wrap-const))
|
||
|
|
||
|
(let* ((template (or (plist-get data :template)
|
||
|
org-protocol-default-template-key))
|
||
|
(url (org-protocol-sanitize-uri (plist-get data :url)))
|
||
|
(type (if (string-match "^\\([a-z]+\\):" url)
|
||
|
(match-string 1 url)))
|
||
|
(html (org-protocol-capture-html--url-html url))
|
||
|
(result (org-protocol-capture-html--eww-readable html))
|
||
|
(title (cdr result))
|
||
|
(content (with-temp-buffer
|
||
|
(insert (org-protocol-capture-html--nbsp-to-space (car result)))
|
||
|
;; Convert to Org with Pandoc
|
||
|
(unless (= 0 (call-process-region (point-min) (point-max)
|
||
|
"pandoc" t t nil "-f" "html" "-t" "org"
|
||
|
org-protocol-capture-html-pandoc-no-wrap-option))
|
||
|
(error "Pandoc failed."))
|
||
|
(save-excursion
|
||
|
;; Remove DOS CR/LF line endings
|
||
|
(goto-char (point-min))
|
||
|
(while (search-forward (string ?\C-m) nil t)
|
||
|
(replace-match "")))
|
||
|
;; Demote page headings in capture buffer to below the
|
||
|
;; top-level Org heading and "Article" 2nd-level heading
|
||
|
(save-excursion
|
||
|
(goto-char (point-min))
|
||
|
(while (re-search-forward (rx bol (1+ "*") (1+ space)) nil t)
|
||
|
(beginning-of-line)
|
||
|
(insert "**")
|
||
|
(end-of-line)))
|
||
|
(buffer-string)))
|
||
|
(orglink (org-make-link-string
|
||
|
url (if (s-present? title) title url)))
|
||
|
;; Avoid call to org-store-link
|
||
|
(org-capture-link-is-already-stored t))
|
||
|
|
||
|
(setq org-stored-links
|
||
|
(cons (list url title) org-stored-links))
|
||
|
(kill-new orglink)
|
||
|
|
||
|
(org-store-link-props :type type
|
||
|
:annotation orglink
|
||
|
:link url
|
||
|
:description title
|
||
|
:orglink orglink
|
||
|
:initial content)
|
||
|
(org-protocol-capture-html--do-capture)
|
||
|
nil))
|
||
|
|
||
|
(add-to-list 'org-protocol-protocol-alist
|
||
|
'("capture-eww-readable"
|
||
|
:protocol "capture-eww-readable"
|
||
|
:function org-protocol-capture-html--capture-eww-readable
|
||
|
:kill-client t))
|
||
|
|
||
|
(defun org-protocol-capture-html--url-html (url)
|
||
|
"Return HTML from URL as string."
|
||
|
(let* ((response-buffer (url-retrieve-synchronously url nil t))
|
||
|
(encoded-html (with-current-buffer response-buffer
|
||
|
(pop-to-buffer response-buffer)
|
||
|
;; Skip HTTP headers, using marker provided by url-http
|
||
|
(delete-region (point-min) (1+ url-http-end-of-headers))
|
||
|
(buffer-string))))
|
||
|
(kill-buffer response-buffer) ; Not sure if necessary to avoid leaking buffer
|
||
|
(with-temp-buffer
|
||
|
;; For some reason, running `decode-coding-region' in the
|
||
|
;; response buffer has no effect, so we have to do it in a
|
||
|
;; temp buffer.
|
||
|
(insert encoded-html)
|
||
|
(condition-case nil
|
||
|
;; Fix undecoded text
|
||
|
(decode-coding-region (point-min) (point-max) 'utf-8)
|
||
|
(coding-system-error nil))
|
||
|
(buffer-string))))
|
||
|
|
||
|
(defun org-protocol-capture-html--eww-readable (html)
|
||
|
"Return `eww-readable' part of HTML with title.
|
||
|
Returns list (HTML . TITLE)."
|
||
|
;; Based on `eww-readable'
|
||
|
(let* ((html
|
||
|
;; Convert " " in HTML to plain spaces.
|
||
|
;; `libxml-parse-html-region' turns them into
|
||
|
;; underlines. The closest I can find to an explanation
|
||
|
;; is at <http://www.perlmonks.org/?node_id=825188>.
|
||
|
(org-protocol-capture-html--nbsp-to-space html))
|
||
|
(dom (with-temp-buffer
|
||
|
(insert html)
|
||
|
(libxml-parse-html-region (point-min) (point-max))))
|
||
|
(title (caddr (car (dom-by-tag dom 'title)))))
|
||
|
(eww-score-readability dom)
|
||
|
(cons (with-temp-buffer
|
||
|
(shr-dom-print (eww-highest-readability dom))
|
||
|
(buffer-string))
|
||
|
title)))))
|
||
|
|
||
|
;;;; Helper functions
|
||
|
|
||
|
(defun org-protocol-capture-html--nbsp-to-space (s)
|
||
|
"Convert HTML non-breaking spaces to plain spaces in S."
|
||
|
;; Not sure why sometimes these are in the HTML and Pandoc converts
|
||
|
;; them to underlines instead of spaces, but this fixes it.
|
||
|
(replace-regexp-in-string (rx " ") " " s t t))
|
||
|
|
||
|
(defun org-protocol-capture-html--do-capture ()
|
||
|
"Call `org-capture' and demote page headings in capture buffer."
|
||
|
(raise-frame)
|
||
|
(funcall 'org-capture nil template)
|
||
|
|
||
|
;; Demote page headings in capture buffer to below the
|
||
|
;; top-level Org heading
|
||
|
(save-excursion
|
||
|
(goto-char (point-min))
|
||
|
(re-search-forward (rx bol "*" (1+ space)) nil t) ; Skip 1st heading
|
||
|
(while (re-search-forward (rx bol "*" (1+ space)) nil t)
|
||
|
(dotimes (n org-protocol-capture-html-demote-times)
|
||
|
(org-demote-subtree)))))
|
||
|
|
||
|
(provide 'org-protocol-capture-html)
|
||
|
|
||
|
;;; org-protocol-capture-html ends here
|