180 lines
7.7 KiB
EmacsLisp
180 lines
7.7 KiB
EmacsLisp
;;; sx-encoding.el --- encoding -*- lexical-binding: t; -*-
|
||
|
||
;; Copyright (C) 2014 Sean Allred
|
||
|
||
;; Author: Sean Allred <code@seanallred.com>
|
||
|
||
;; This program is free software; you can redistribute it and/or modify
|
||
;; it under the terms of the GNU General Public License as published by
|
||
;; the Free Software Foundation, either version 3 of the License, or
|
||
;; (at your option) any later version.
|
||
|
||
;; This program is distributed in the hope that it will be useful,
|
||
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
;; GNU General Public License for more details.
|
||
|
||
;; You should have received a copy of the GNU General Public License
|
||
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||
|
||
;;; Commentary:
|
||
|
||
;; This file handles decoding the responses we get from the API. They
|
||
;; are received either as plain-text or as a `gzip' compressed archive.
|
||
;; For this, `sx-encoding-gzipped-p' is used to determine if content
|
||
;; has been compressed under `gzip'.
|
||
|
||
;;; Code:
|
||
|
||
(require 'cl-lib)
|
||
|
||
|
||
;;;; HTML Encoding
|
||
|
||
(defcustom sx-encoding-html-entities-plist
  ;; Glyphs that are invisible or look like a plain space (emsp, ensp,
  ;; lrm, nbsp, rlm, shy, thinsp, zwj, zwnj) are spelled with explicit
  ;; \uNNNN escapes so that they cannot be silently lost or confused
  ;; with an empty string or an ordinary space.
  '(Aacute "Á" aacute "á" Acirc "Â" acirc "â" acute "´" AElig "Æ" aelig "æ"
    Agrave "À" agrave "à" alefsym "ℵ" Alpha "Α" alpha "α" amp "&" and "∧"
    ang "∠" apos "'" aring "å" Aring "Å" asymp "≈" atilde "ã" Atilde "Ã"
    auml "ä" Auml "Ä" bdquo "„" Beta "Β" beta "β" brvbar "¦" bull "•"
    cap "∩" ccedil "ç" Ccedil "Ç" cedil "¸" cent "¢" Chi "Χ" chi "χ"
    circ "ˆ" clubs "♣" cong "≅" copy "©" crarr "↵" cup "∪" curren "¤"
    Dagger "‡" dagger "†" darr "↓" dArr "⇓" deg "°" Delta "Δ" delta "δ"
    diams "♦" divide "÷" eacute "é" Eacute "É" ecirc "ê" Ecirc "Ê" egrave "è"
    Egrave "È" empty "∅" emsp "\u2003" ensp "\u2002" Epsilon "Ε" epsilon "ε"
    equiv "≡"
    Eta "Η" eta "η" eth "ð" ETH "Ð" euml "ë" Euml "Ë" euro "€"
    exist "∃" fnof "ƒ" forall "∀" frac12 "½" frac14 "¼" frac34 "¾" frasl "⁄"
    Gamma "Γ" gamma "γ" ge "≥" gt ">" harr "↔" hArr "⇔" hearts "♥"
    hellip "…" iacute "í" Iacute "Í" icirc "î" Icirc "Î" iexcl "¡" igrave "ì"
    Igrave "Ì" image "ℑ" infin "∞" int "∫" Iota "Ι" iota "ι" iquest "¿"
    isin "∈" iuml "ï" Iuml "Ï" Kappa "Κ" kappa "κ" Lambda "Λ" lambda "λ"
    lang "〈" laquo "«" larr "←" lArr "⇐" lceil "⌈" ldquo "“" le "≤"
    lfloor "⌊" lowast "∗" loz "◊" lrm "\u200e" lsaquo "‹" lsquo "‘" lt "<"
    macr "¯" mdash "—" micro "µ" middot "·" minus "−" Mu "Μ" mu "μ"
    nabla "∇" nbsp "\u00a0" ndash "–" ne "≠" ni "∋" not "¬" notin "∉"
    nsub "⊄" ntilde "ñ" Ntilde "Ñ" Nu "Ν" nu "ν" oacute "ó" Oacute "Ó"
    ocirc "ô" Ocirc "Ô" OElig "Œ" oelig "œ" ograve "ò" Ograve "Ò" oline "‾"
    omega "ω" Omega "Ω" Omicron "Ο" omicron "ο" oplus "⊕" or "∨" ordf "ª"
    ordm "º" oslash "ø" Oslash "Ø" otilde "õ" Otilde "Õ" otimes "⊗" ouml "ö"
    Ouml "Ö" para "¶" part "∂" permil "‰" perp "⊥" Phi "Φ" phi "φ"
    Pi "Π" pi "π" piv "ϖ" plusmn "±" pound "£" Prime "″" prime "′"
    prod "∏" prop "∝" Psi "Ψ" psi "ψ" quot "\"" radic "√" rang "〉"
    raquo "»" rarr "→" rArr "⇒" rceil "⌉" rdquo "”" real "ℜ" reg "®"
    rfloor "⌋" Rho "Ρ" rho "ρ" rlm "\u200f" rsaquo "›" rsquo "’" sbquo "‚"
    scaron "š" Scaron "Š" sdot "⋅" sect "§" shy "\u00ad" Sigma "Σ" sigma "σ"
    sigmaf "ς" sim "∼" spades "♠" sub "⊂" sube "⊆" sum "∑" sup "⊃"
    sup1 "¹" sup2 "²" sup3 "³" supe "⊇" szlig "ß" Tau "Τ" tau "τ"
    there4 "∴" Theta "Θ" theta "θ" thetasym "ϑ" thinsp "\u2009" thorn "þ"
    THORN "Þ"
    tilde "˜" times "×" trade "™" uacute "ú" Uacute "Ú" uarr "↑" uArr "⇑"
    ucirc "û" Ucirc "Û" ugrave "ù" Ugrave "Ù" uml "¨" upsih "ϒ" Upsilon "Υ"
    upsilon "υ" uuml "ü" Uuml "Ü" weierp "℘" Xi "Ξ" xi "ξ" yacute "ý"
    Yacute "Ý" yen "¥" yuml "ÿ" Yuml "Ÿ" Zeta "Ζ" zeta "ζ" zwj "\u200d"
    zwnj "\u200c")
  "Plist of HTML entities and their respective glyphs.
Each key is the entity name as a symbol (without the surrounding
\"&\" and \";\"); each value is the string it is replaced with.
Entity names are case-sensitive.

See `sx-encoding-decode-entities'."
  :type '(repeat (choice symbol string))
  :group 'sx)
|
||
|
||
(defun sx-encoding-decode-entities (string)
  "Decode HTML entities (e.g. \"&quot;\") in STRING.

Named entities are looked up in `sx-encoding-html-entities-plist'.
Numeric character references are accepted in decimal (\"&#39;\")
and hexadecimal (\"&#x27;\") form and converted to a string with
`char-to-string'.  Anything that is neither a known entity nor a
valid character reference is left in place unchanged.

Return the decoded string."
  (replace-regexp-in-string
   "&[^; ]*;"
   (lambda (entity)
     ;; `replace-regexp-in-string' still needs its own match data
     ;; after calling us, so do not clobber it.
     (save-match-data
       (let* ((case-fold-search nil)
              ;; Strip the leading `&' and the trailing `;'.
              (name (substring entity 1 -1))
              (code (cond
                     ((string-match "\\`#[xX]\\([[:xdigit:]]+\\)\\'" name)
                      (string-to-number (match-string 1 name) 16))
                     ((string-match "\\`#\\([0-9]+\\)\\'" name)
                      (string-to-number (match-string 1 name) 10)))))
         (cond
          ;; Handle things like &#39; and &#x27;
          ((characterp code)
           (char-to-string code))
          ;; Handle things like &quot;.  `intern-soft' avoids creating
          ;; a new symbol for every unknown name found in the input.
          ((let ((symbol (intern-soft name)))
             (and symbol
                  (plist-get sx-encoding-html-entities-plist symbol))))
          ;; Not something we know how to decode: keep it verbatim.
          (t entity)))))
   string
   ;; FIXEDCASE and LITERAL: insert the glyph exactly as is.  Without
   ;; LITERAL a decoded backslash would be parsed as an escape.
   t t))
|
||
|
||
|
||
;;;; Convenience Functions
|
||
|
||
(defun sx-encoding-normalize-line-endings (string)
  "Return STRING with every carriage return removed.
Strings coming from the API use Windows-style line endings
\(\"\\r\\n\"), whereas Emacs expects the Unix convention (\"\\n\").
Dropping each \"\\r\" is enough to convert one into the other."
  (let ((carriage-return ?\C-m))
    (delete carriage-return string)))
|
||
|
||
(defun sx-encoding-clean-content (string)
  "Return a copy of STRING that is fit for display.
The line endings are normalized first, using
`sx-encoding-normalize-line-endings'; the result is then passed
through `sx-encoding-decode-entities'."
  (let ((unix-text (sx-encoding-normalize-line-endings string)))
    (sx-encoding-decode-entities unix-text)))
|
||
|
||
(defun sx-encoding-clean-content-deep (data)
  "Walk DATA and clean every string found inside it.

A string is returned after applying `sx-encoding-clean-content'.

A vector or a list is rebuilt as the same type of structure, with
this function applied to each of its elements.

A cons cell whose `cdr' is not a list (a dotted pair) keeps its
`car' untouched; only its `cdr' is cleaned.

Any other object is returned as is.

This function is tailored to the data structures returned by
`json-read' via `sx-request-make' and may fail on other input."
  (cond
   ((stringp data)
    (sx-encoding-clean-content data))
   ((vectorp data)
    (vconcat (mapcar #'sx-encoding-clean-content-deep data)))
   ((not (consp data))
    data)
   ;; From here on DATA is a cons cell.
   ((listp (cdr data))
    (mapcar #'sx-encoding-clean-content-deep data))
   (t
    (cons (car data)
          (sx-encoding-clean-content-deep (cdr data))))))
|
||
|
||
|
||
;;;; GZIP
|
||
|
||
(defun sx-encoding-gzipped-p (data)
  "Check for magic bytes in DATA.
Return non-nil if DATA is a string whose first two bytes match
the magic numbers identifying the gzip file format (31 and 139).
Return nil for anything else, including strings that are too
short to hold the magic numbers.

See URL `http://www.gzip.org/zlib/rfc-gzip.html'."
  ;; Credit: http://emacs.stackexchange.com/a/2978
  (and (stringp data)
       ;; Guard against empty or one-character input, for which
       ;; `substring' would signal `args-out-of-range'.
       (>= (length data) 2)
       (let* ((head (substring data 0 2))
              ;; Look at bytes rather than characters.  In a multibyte
              ;; string a raw byte such as 139 is stored as a special
              ;; character; encoding as `utf-8-emacs' turns it back
              ;; into the single byte it stands for.
              (bytes (if (multibyte-string-p head)
                         (encode-coding-string head 'utf-8-emacs t)
                       head)))
         (equal (substring bytes 0 2)
                (unibyte-string 31 139)))))
|
||
|
||
(defun sx-encoding-gzipped-buffer-p (buffer)
  "Check if BUFFER is gzip-compressed.
Only the first two characters of the accessible portion of BUFFER
are extracted and handed to `sx-encoding-gzipped-p', which see."
  (with-current-buffer buffer
    (sx-encoding-gzipped-p
     ;; The magic numbers live at the very start, so there is no need
     ;; to copy the whole (possibly large) buffer into a string.
     (buffer-substring-no-properties
      (point-min)
      (min (point-max) (+ (point-min) 2))))))
|
||
|
||
(defun sx-encoding-gzipped-file-p (file)
  "Check if the FILE is gzip-compressed.
The first two bytes of FILE are read literally into a temporary
unibyte buffer and handed to `sx-encoding-gzipped-p', which see."
  (with-temp-buffer
    ;; Work on raw bytes: no decoding, no format conversion.
    (set-buffer-multibyte nil)
    (insert-file-contents-literally file nil 0 2)
    (sx-encoding-gzipped-p (buffer-string))))
|
||
|
||
(provide 'sx-encoding)
|
||
;;; sx-encoding.el ends here
|
||
|
||
;; Local Variables:
|
||
;; indent-tabs-mode: nil
|
||
;; End:
|