;; emacs lisp. emacs 22.
;; 2008-01-03.
;; Generate a report of Wikipedia links.
;; This program traverses a given dir, visits every html file, finds links
;; to Wikipedia in those files, collects them, generates an html report of
;; these links and the files they are from, then writes it to a given file.

;; Xah Lee
;; ∑ http://xahlee.org/

;;;;----------------------------------
;;;; user level global parameters

;; the dir to process
(setq dirpath "/Users/xah/web/")

;; root-path-char-count is an integer that counts how many chars to take
;; off of a given file's full path, to result as a relative path for the
;; link url.
;; e.g. if file path is "/Users/xah/web/emacs/emacs.html", and
;; root-path-char-count is 15, then its url in link would be
;; "emacs/emacs.html".
(setq root-path-char-count 15)

;; the file to save the generated report to. (existing file is overwritten)
(setq output-file "/Users/xah/web/wikipedia_links.html")

;;;;----------------------------------
;;;; loading package. global vars.

(setq tmpBufName " xahtemp")

(require 'find-lisp)
(require 'gnus-util) ; provides gnus-url-unhex-string

;; create hash table.
;; for each entry, the key is Wikipedia url, and value is a list of file paths.
;; like this: ("Wikipedia url" ("file1" "file2" ...))
(setq wpdata-hash (make-hash-table :test 'equal :size 4000))

;; a list version of the hash for sorting & report
(setq wpdata-list '())

;; header text for the generated HTML file.
;; NOTE(review): the markup below is a minimal reconstruction; only the
;; title/heading text survived in this copy of the file. Confirm against
;; the site's page template.
(setq header-text
"<!DOCTYPE html>
<html>
<head>
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">
<title>Links To Wikipedia from XahLee.org</title>
</head>
<body>

<h1>Links To Wikipedia from XahLee.org</h1>

")

;; footer text for the generated HTML file.
;; NOTE(review): the footer originally ended with one more element (its
;; remaining text began with "Xah) whose markup is not recoverable from
;; this copy; restore it from the original if available.
(setq footer-text
"
<p>Page created: 2008-01.<br>
© 2008 by Xah Lee.</p>

</body>
</html>
")

;;;; -------------------------------
;;;; subroutines

(defun insert-date ()
  "Insert current date in the form yyyy-mm-dd."
  (interactive)
  (insert (format-time-string "%Y-%m-%d")))

(defun hash-to-list (hashtable)
  "Return a list that represents HASHTABLE.
Each element is of the form (key value). The order is unspecified."
  (let (mylist)
    (maphash
     (lambda (kk vv) (setq mylist (cons (list kk vv) mylist)))
     hashtable)
    mylist))

(defun add-wplink-to-hash (filePath)
  "Get Wikipedia links in file FILEPATH and add them to `wpdata-hash'.
The file's content replaces the content of the current buffer, so call
this with a scratch buffer current.
Urls containing \"=\" (e.g. history pages) are skipped.
A file is recorded at most once per url."
  (let (url files)
    (insert-file-contents filePath nil nil nil t)
    (goto-char (point-min))
    (while (re-search-forward
            "href=\"\\(http://..\\.wikipedia\\.org/[^\"]+\\)\">\\([^<]+\\)"
            nil t)
      ;; grab the url before string-match below touches the match data
      (setq url (match-string 1))
      (unless (string-match "=" url)    ; not some history page
        (setq files (gethash url wpdata-hash))
        ;; prepend to the existing entry (nil if the url is new),
        ;; unless this file is already listed for this url
        (unless (member filePath files)
          (puthash url (cons filePath files) wpdata-hash))))))

(defun prt-each (ele)
  "Insert one html list item for ELE at point.
ELE is of the form (url (filepath1 filepath2 ...)).
The inserted text is like this:
<li><a href=\"url\">name↗</a>: <a href=\"path1\">title1</a>, <a href=\"path2\">title2</a>.</li>
Each file path is made relative by dropping its first
`root-path-char-count' chars."
  (let ((wplink (car ele))
        (files (cadr ele)))
    (insert "<li>")
    (insert (wikipedia-url-to-link wplink))
    (insert ":")
    (dolist (x files)
      (insert " <a href=\"" (substring x root-path-char-count) "\">"
              (get-html-file-title x)
              "</a>,"))
    (delete-char -1)                    ; drop the trailing comma
    (insert ".")
    (insert "</li>\n")))

;;;;------------------------------------

(defun wikipedia-url-to-link (url)
  "Return URL as a html link string.
The link text is the last path component of URL, percent-decoded, with
\"_\" shown as space and \"↗\" appended.
Example: http://en.wikipedia.org/wiki/Emacs
becomes <a href=\"http://en.wikipedia.org/wiki/Emacs\">Emacs↗</a>."
  (let ((linktext (gnus-url-unhex-string url nil)))
    (setq linktext (concat (car (last (split-string linktext "/"))) "↗"))
    ;; the decoded text goes into html, so a bare & must become an entity
    (setq linktext (replace-regexp-in-string "&" "&amp;" linktext))
    (setq linktext (replace-regexp-in-string "_" " " linktext))
    (concat "<a href=\"" url "\">" linktext "</a>")))

(defun get-html-file-title (fname)
  "Return the text of the <title> tag of html file FNAME.
If the file has no <title>...</title>, return FNAME without its directory."
  (let (x1 x2)
    (save-current-buffer
      (set-buffer (get-buffer-create " tmp8293"))
      (insert-file-contents fname nil nil nil t)
      ;; move to the top only after the content is replaced
      (goto-char (point-min))
      (if (and (search-forward "<title>" nil t)
               (setq x1 (point))
               (search-forward "</title>" nil t))
          (progn
            (setq x2 (match-beginning 0))
            (buffer-substring-no-properties x1 x2))
        (file-name-nondirectory fname)))))

;;;; ------------------------------------------------
;;;; main

;; back up the previous report, then remove it so it is not scanned
(when (file-exists-p output-file)
  (copy-file output-file (concat output-file "~") t)
  (delete-file output-file))

;; get links from files, put to hash
(save-current-buffer
  (set-buffer (get-buffer-create tmpBufName))
  (mapc 'add-wplink-to-hash (find-lisp-find-files dirpath "\\.html$"))
  (setq wpdata-list (hash-to-list wpdata-hash))
  (setq wpdata-list
        (sort wpdata-list
              (lambda (a b)
                (string< (downcase (car a)) (downcase (car b)))))))

;; print it out in a temp buffer and save to file
(switch-to-buffer tmpBufName)
(erase-buffer)
(insert header-text)
(insert "<p>This page contains all existing links from XahLee.org to Wikipedia, as of ")
(insert-date)
(insert ". There are a total of "
        (number-to-string (length wpdata-list))
        " links.</p>\n\n")
(insert "<ul>\n")
(mapc 'prt-each wpdata-list)
(insert "</ul>\n")
(insert footer-text)
;; the report contains non-ascii chars (↗ ©); save as declared in the header
(set-buffer-file-coding-system 'utf-8)
(write-file output-file)

(clrhash wpdata-hash)
(setq wpdata-list '())