Remove dependency on HTML tidy - squeeze - A static site generator that can put the toothpaste back in the tube.

commit 54b6aa5aa07cbdc3c7f5b229f5f978b67449ff96
parent fc716527fb66779944f370041b16aec34795224a
Author: St John Karp <contact@stjo.hn>
Date:   Sun,  9 Feb 2020 17:57:32 -0600

Remove dependency on HTML tidy

Removed HTML tidy from the shell script. Not only does this
remove a dependency, it also removes the only asymmetrical part
of the generate/ungenerate process. This was preventing the script
from being properly reversible. Now (and with supporting changes
to the HTML template) the script runs both ways on the same site
and generates reproducible output each time.

Diffstat:
M helpers.pl  | 12 ++++++++----
M html.pl  | 70 +++++++++++++++++++++++++++++++++++-----------------------------------
M readme.md  | 11 +++++------
M rss.pl  | 36 ++++++++++++++++++------------------
M tastic.sh  | 7 +++----

5 files changed, 69 insertions(+), 67 deletions(-)
diff --git a/helpers.pl b/helpers.pl
@@ -58,8 +58,12 @@ anything([X|Rest]) --> [X], anything(Rest).
 
 whitespace --> [].
 
-whitespace --> "\n", whitespace.
+whitespace --> newline, whitespace.
 
-whitespace --> "\t", whitespace.
+whitespace --> tab, whitespace.
 
-whitespace --> " ", whitespace.
-\ No newline at end of file
+whitespace --> " ", whitespace.
+
+newline --> "\n".
+
+tab --> "\t".
+\ No newline at end of file
diff --git a/html.pl b/html.pl
@@ -6,64 +6,64 @@
 
 page(Entry, Title, Subtitle, Date) -->
 	doctype,
-	whitespace,
+	newline,
 	html(Entry, Title, Subtitle, Date).
 
 html(Entry, Title, Subtitle, Date) -->
 	html_open,
-	whitespace,
+	newline,
 	head(Title),
-	whitespace,
+	newline,
 	body(Entry, Title, Subtitle, Date),
-	whitespace,
+	newline,
 	html_close.
 
 head(Title) -->
 	head_open,
-	whitespace,
+	newline, tab,
 	title(Title),
-	whitespace,
+	newline, tab,
 	meta,
-	whitespace,
+	newline, tab,
 	styles,
-	whitespace,
+	newline, tab,
 	rss,
-	whitespace,
+	newline,
 	head_close.
 
 body(Entry, Title, Subtitle, Date) -->
 	body_open,
-	whitespace,
+	newline, tab,
 	header(Title),
-	whitespace,
+	newline, tab,
 	article(Entry, Title, Subtitle, Date),
-	whitespace,
+	newline, tab,
 	entry_utility,
-	whitespace,
+	newline, tab,
 	footer,
-	whitespace,
+	newline,
 	body_close.
 
 header(Title) -->
 	header_open,
-	whitespace,
+	newline, tab, tab,
 	header_title(Title),
-	whitespace,
+	newline, tab, tab,
 	header_subtitle,
-	whitespace,
+	newline, tab,
 	header_close.
 
 article(Entry, Title, Subtitle, Date) -->
 	article_open,
-	whitespace,
+	newline, tab, tab,
 	article_header(Title, Subtitle, Date),
-	whitespace,
+	newline, tab, tab,
 	div_entry_open,
-	whitespace,
+	newline,
 	anything(Entry),
-	whitespace,
+	newline, tab, tab,
 	div_entry_close,
-	whitespace,
+	newline, tab,
 	article_close,
 	{ [First|_] = Entry,  char_code('<', First) }.
 
@@ -77,41 +77,41 @@ article_header(Title, null, null) -->
 % An article without a subtitle.
 article_header(Title, null, Date) -->
 	article_title(Title),
-	whitespace,
+	newline, tab, tab,
 	article_meta(Date).
 
 % An article without metadata.
 article_header(Title, Subtitle, null) -->
 	article_title(Title),
-	whitespace,
+	newline, tab, tab,
 	article_subtitle(Subtitle).
 
 % An article with all header components.
 article_header(Title, Subtitle, Date) -->
 	article_title(Title),
-	whitespace,
+	newline, tab, tab,
 	article_subtitle(Subtitle),
-	whitespace,
+	newline, tab, tab,
 	article_meta(Date).
 
 footer -->
 	footer_open,
-	whitespace,
+	newline, tab, tab,
 	p_center_open,
-	whitespace,
+	newline, tab, tab, tab,
 	license_link,
-	whitespace,
+	newline, tab, tab, tab,
 	br,
-	whitespace,
+	newline, tab, tab, tab,
 	license_text,
-	whitespace,
+	newline, tab, tab,
 	p_close,
-	whitespace,
+	newline, tab,
 	footer_close.
 
 doctype --> "<!DOCTYPE html>".
 
-html_open --> "<html lang=\"en\">".
+html_open --> "<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">".
 
 head_open --> "<head>".
 
@@ -195,14 +195,14 @@ article_subtitle(ArticleSubtitle) -->
 
 article_meta(ArticleDate) -->
 	"<div class=\"entry-meta\">",
-	whitespace,
+	newline, tab, tab, tab,
 	"<time datetime=\"",
 	anything(ArticleDate),
 	anything(_),
 	"\">",
 	anything(ArticleDate),
 	"</time>",
-	whitespace,
+	newline, tab, tab,
 	"</div><!-- .entry-meta -->".
 
 div_entry_open --> "<div class=\"entry-content\">".
diff --git a/readme.md b/readme.md
@@ -6,15 +6,14 @@ A static site generator in Prolog (mostly).
 
 A few months ago I lost the source files I used to generate my static website. Fortunately there was no irreparable data loss because I still had the generated site up on my server. The problem was now I needed to write a script that would extract all the articles into source files again, and then reconfigure the site generator. Then I went, "Oh. This is a Prolog problem." I figured if I could write a Prolog program that described my HTML template then I could use the same code both to un-generate and re-generate the website, because a Prolog program is basically a set of rules and the logic can be run in either direction. (But then I love Prolog so every problem is a Prolog problem but I don't care. Fight me.)
 
-So the skinny is I wound up writing my own static website generator in Prolog. Well, the main components are in Prolog. I also wrote a bash script to make use of a bunch of common \*nix utilities (find, sed, grep, etc.) and to pipe output to some third-party programs where I needed them (HTML Tidy and it's still TBD, but possibly Pandoc in the future). Weirdest bit was that I just couldn't find anything decent to generate RSS feeds. I considered dropping the RSS all together, but I've spent enough time haranguing people for not supporting interoperable standards that I didn't want to be a hypocrite. I wound up writing my own RSS generator too, also in Prolog.
+So the skinny is I wound up writing my own static website generator in Prolog. Well, the main components are in Prolog. I also wrote a bash script to make use of a bunch of common \*nix utilities (find, sed, grep, etc.) and to pipe output to some third-party programs where I needed them (Smartypants, and it's still TBD but possibly Pandoc in the future). Weirdest bit was that I just couldn't find anything decent to generate RSS feeds. I considered dropping the RSS all together, but I've spent enough time haranguing people for not supporting interoperable standards that I didn't want to be a hypocrite. I wound up writing my own RSS generator too, also in Prolog.
 
-It's all reeeeeally alpha and is pretty closely tailored to my specific needs, but it works, and IMHO it works better than my old site generator which injected a bunch of nonsense into my HTML.
+It's pretty closely tailored to my specific needs, but it works, and IMHO it works better than my old site generator which injected a bunch of nonsense into my HTML. To make this work for your site, all you need to do is define the template of your website in "html.pl".
 
 ## Dependencies
 
 * Bash. Used to run the script that automates everything else.
-* A Prolog interpreter. Tested with [SWI-Prolog](https://www.swi-prolog.org/), but the syntax should be vanilla ISO Prolog and should work with any implementation.
-* [HTML Tidy](http://www.html-tidy.org/). Used to format the HTML output nicely — not something I relished doing in Prolog.
+* A Prolog interpreter. Tested with [SWI-Prolog](https://www.swi-prolog.org/), but the syntax aims to be vanilla ISO Prolog and should work with any implementation.
 * [Smartypants](https://github.com/leohemsted/smartypants.py). Used to smarten the punctuation in the HTML output.
 
 ## Assumptions
@@ -23,9 +22,9 @@ The website folder used in the second argument is expected to contain three thin
 
 * a "source" folder containing the website's source;
 * an "output" folder containing the website's static output;
-* a "site.pl" file containing site-specific definitions.
+* and a "site.pl" file containing site-specific definitions.
 
-One or the other of the "source" and "output" folders must be populated, but not necessarily both.
+One or the other of the "source" and "output" folders must be populated, but not necessarily both. In the case of saving a website for which you'd lost the source code, you'd populate "output", ungenerate the site, then commit the contents of "source" to version control.
 
 site.pl contains DCG definitions of this site's specifics, such as title, author, etc. An example site.pl file might look like this:
 
diff --git a/rss.pl b/rss.pl
@@ -5,39 +5,38 @@
 
 rss(BuildDate, Articles) -->
 	rss_open,
-	"\n",
+	newline, tab, tab,
 	channel_meta(BuildDate),
-	"\n",
 	items(Articles),
-	"\n",
+	newline, tab,
 	rss_close.
 
 rss_open -->
 	"<?xml version=\"1.0\" encoding=\"UTF-8\" ?>",
-	"\n",
+	newline,
 	"<rss version=\"2.0\">",
-	"\n",
+	newline, tab,
 	"<channel>".
 
 channel_meta(BuildDate) -->
 	"<title>",
 	site_title,
 	"</title>",
-	"\n",
+	newline, tab, tab,
 	"<description>",
 	site_subtitle,
 	"</description>",
-	"\n",
+	newline, tab, tab,
 	"<link>",
 	site_url,
 	"</link>",
-	"\n",
+	newline, tab, tab,
 	language,
-	"\n",
+	newline, tab, tab,
 	copyright,
-	"\n",
+	newline, tab, tab,
 	webmaster,
-	"\n",
+	newline, tab, tab,
 	last_build_date(BuildDate).
 
 title(Title) -->
@@ -80,18 +79,19 @@ items([]) --> [].
 items([First|Rest]) --> item(First), items(Rest).
 
 item(article(Date, Title, Link, Description)) -->
+	newline, tab, tab,
 	item_open,
-	"\n",
+	newline, tab, tab, tab,
 	title(Title),
-	"\n",
+	newline, tab, tab, tab,
 	link(Link),
-	"\n",
+	newline, tab, tab, tab,
 	description(Description),
-	"\n",
+	newline, tab, tab, tab,
 	author,
-	"\n",
+	newline, tab, tab, tab,
 	pubdate(Date),
-	"\n",
+	newline, tab, tab,
 	item_close.
 
 item_open --> "<item>".
@@ -110,5 +110,5 @@ item_close --> "</item>".
 
 rss_close -->
 	"</channel>",
-	"\n",
+	newline,
 	"</rss>".
diff --git a/tastic.sh b/tastic.sh
@@ -29,7 +29,8 @@ then
 		done
 
 	# Copy anything else directly.
-	find "$SITE_PATH"/"$OUTPUT_DIR" -type f -not -name "*.html" -print0 |
+	# Excludes the RSS folder, which we create ourselves upon generation.
+	find "$SITE_PATH"/"$OUTPUT_DIR" -path "$SITE_PATH"/"$OUTPUT_DIR"/feeds -prune -o -type f -not -name "*.html" -print0 |
 		while IFS= read -r -d '' file; do
 			NEW_PATH=`echo "$file" | sed "s|^$SITE_PATH/$OUTPUT_DIR|$SITE_PATH/$SOURCE_DIR|"`
 			cp "$file" "$NEW_PATH"
@@ -49,7 +50,6 @@ then
 			NEW_PATH=`echo "$file" | sed "s|^$SITE_PATH/$SOURCE_DIR|$SITE_PATH/$OUTPUT_DIR|" | sed 's|.md$|.html|'`
 			cat "$file" |
 				swipl --traditional -q -l parse_entry.pl -g "consult('$SITE_PATH/site.pl'), generate_entry." |
-				tidy5 -quiet --indent auto --indent-with-tabs yes --wrap 0 -asxml --tidy-mark no |
 				smartypants \
 				> "$NEW_PATH"
 		done
@@ -65,8 +65,7 @@ then
 	mkdir -p "$SITE_PATH"/"$OUTPUT_DIR"/feeds
 	ARTICLES=`grep -Rl --include=\*.md "^Date: " "$SITE_PATH"/"$SOURCE_DIR" | paste -sd ',' - | sed "s|,|','|g"`
 	BUILD_DATE=`date +"%Y-%m-%d %T"`
-	swipl --traditional -q -l generate_rss.pl -g "consult('$SITE_PATH/site.pl'), generate_rss(\"$BUILD_DATE\", ['$ARTICLES'])." |
-		tidy5 -quiet --indent auto --indent-with-tabs yes --wrap 0 -xml --tidy-mark no \
+	swipl --traditional -q -l generate_rss.pl -g "consult('$SITE_PATH/site.pl'), generate_rss(\"$BUILD_DATE\", ['$ARTICLES'])." \
 		> "$SITE_PATH"/"$OUTPUT_DIR"/feeds/rss.xml
 else
 	echo "Invalid argument."

	squeeze A static site generator that can put the toothpaste back in the tube.
	git clone https://git.stjo.hn/squeeze
	Log \| Files \| Refs \| README \| LICENSE

M	helpers.pl	\|	12	++++++++----
M	html.pl	\|	70	+++++++++++++++++++++++++++++++++++-----------------------------------
M	readme.md	\|	11	+++++------
M	rss.pl	\|	36	++++++++++++++++++------------------
M	tastic.sh	\|	7	+++----