The OpenD Programming Language

1 /++
2 	Converts HTML to plain text. Can also output VT escape sequences for terminal output.
3 
4 	The exact output of this is subject to change - it is just what appears nice for me. (I actually use this on my personal email setup.)
5 +/
6 module arsd.htmltotext;
7 
8 import arsd.dom;
9 import arsd.color;
10 import std.string;
11 
12 import std.uni : isWhite;
13 
/++
	Renders an [arsd.dom] element tree as word-wrapped plain text, optionally
	decorated with VT escape sequences.

	Output accumulates in [s]; neither [convert] overload clears it, so call
	[reset] before reusing one converter for an unrelated document.
+/
class HtmlConverter {
	/// Wrap column used by [sink] when no override is passed. Overwritten by every call to [htmlToText].
	int width;

	/++
		Will enable color output using VT codes. Determines color through dom.d's css support, which means you need to apply a stylesheet first.

		---
		import arsd.dom;

		auto document = new Document(source_code_for_html);
		auto stylesheet = new StyleSheet(source_code_for_css);
		stylesheet.apply(document);
		---
	+/
	bool enableVtOutput;


	/// Foreground color in effect for the element currently being rendered (only updated when [enableVtOutput] is set).
	string color;
	/// Background color in effect for the element currently being rendered (only updated when [enableVtOutput] is set).
	string backgroundColor;

	/++
		Recursively renders `element` and its descendants into [s].

		Params:
			element = node to render; text nodes are emitted character by character, other elements are dispatched on their tag name
			preformatted = when true, whitespace is passed through instead of being collapsed
			width = wrap column; stored into [this.width] for the duration of the walk
	+/
	void htmlToText(Element element, bool preformatted, int width) {
		string color, backgroundColor;
		if(enableVtOutput) {
			color = element.computedStyle.getValue("color");
			backgroundColor = element.computedStyle.getValue("background-color");
		}

		string originalColor = this.color, originalBackgroundColor = this.backgroundColor;

		// an element without its own value keeps whatever its ancestors established
		this.color = color.length ? color : this.color;
		this.backgroundColor = backgroundColor.length ? backgroundColor : this.backgroundColor;

		scope(exit) {
			// the idea is as we pop working back up the tree, it restores what it was here
			this.color = originalColor;
			this.backgroundColor = originalBackgroundColor;
		}


		this.width = width;
		if(auto tn = cast(TextNode) element) {
			foreach(dchar ch; tn.nodeValue) {
				sink(ch, preformatted);
			}
		} else {
			// renders every child with the same formatting mode and width as this element
			void sinkChildren() {
				foreach(child; element.childNodes)
					htmlToText(child, preformatted, width);
			}
			switch(element.tagName) {
				case "head", "script", "style":
					// intentionally blank
				break;
				// The table stuff is removed right now because while it looks
				// ok for test tables, it isn't working well for the emails I have
				// - it handles data ok but not really nested layouts.
				case "trlol":
					auto children = element.childElements;

					// With no cells there is nothing to lay out side by side; a width of 0
					// routes that case into the "too narrow" fallback below instead of
					// dividing by zero.
					auto cellCount = cast(int) children.length;
					auto tdWidth = cellCount ? (width - cellCount*3) / cellCount : 0;
					if(tdWidth < 12) {
						// too narrow to be reasonable
						startBlock();
						sinkChildren();
						endBlock();
					} else {
						string[] tdBlocks;
						int longestBlock;
						// render each cell on its own, then stitch the results together line by line
						foreach(child; children) {
							auto fmt = new HtmlConverter();

							fmt.htmlToText(child, false, tdWidth);
							tdBlocks ~= fmt.s;
							int lineCount = 1;
							foreach(ch; fmt.s)
								if(ch == '\n')
									lineCount++;
							if(lineCount > longestBlock)
								longestBlock = lineCount;
						}

						if(s.length && s[$-1] != '\n')
							s ~= '\n';
						foreach(lineNumber; 0 .. longestBlock) {
							foreach(bidx, ref block; tdBlocks) {
								auto ob = block;
								if(bidx)
									s ~= " | ";
								if(block.length) {
									// peel the next line off the front of this cell's text
									auto idx = block.indexOf("\n");
									if(idx == -1)
										idx = block.length;

									s ~= block[0 .. idx];

									if(idx == block.length)
										block = block[$..$];
									else
										block = block[idx + 1 .. $];
								}

								if(ob.length < tdWidth)
								foreach(a; 0 .. tdWidth - block.length)
									s ~= " ";

							}
							s ~= "\n";
						}

						// separator rule under the row
						foreach(a; 0 .. children.length) {
							foreach(w; 0 .. tdWidth) {
								s ~= "-";
							}
							if(a +1 != children.length)
								s ~= "-+-";
						}
						s ~= "\n";
					}
				break;
				case "tr":
					startBlock(2);
					sinkChildren();
					endBlock();
				break;
				case "td":
					startBlock(0);
					sinkChildren();
					endBlock();
				break;
				case "a":
					sinkChildren();
					// Only show the target when there is one and it adds information
					// beyond the link text; anchors without an href used to print " <>".
					string href = element.href;
					if(href.length && href != element.innerText) {
						sink(' ', false);
						sink('<', false);
						// I want the link itself to NOT word wrap
						// to make for easier double-clicking of it in
						// the terminal
						foreach(dchar ch; href)
							sink(ch, false, int.max);
						sink('>', false);
					}
				break;
				case "span":
					if(enableVtOutput) {
						auto csc = color; // element.computedStyle.getValue("color");
						if(csc.length) {
							auto c = Color.fromString(csc);
							// 24-bit foreground color
							s ~= format("\033[38;2;%d;%d;%dm", c.r, c.g, c.b);
						}

						bool bold = element.computedStyle.getValue("font-weight") == "bold";

						if(bold)
							s ~= "\033[1m";

						sinkChildren();

						if(bold)
							s ~= "\033[0m";
						if(csc.length)
							s ~= "\033[39m";
					} else {
						sinkChildren();
					}
				break;
				case "p":
					startBlock();
					sinkChildren();
					endBlock();
				break;
				case "b", "strong":
				case "em", "i":
					// skip empty emphasis so no stray markers are emitted
					if(element.innerText.length == 0)
						break;
					if(enableVtOutput) {
						s ~= "\033[1m";
						sinkChildren();
						s ~= "\033[0m";
					} else {
						sink('*', false);
						sinkChildren();
						sink('*', false);
					}
				break;
				case "u":
					if(element.innerText.length == 0)
						break;
					sink('_', false);
					sinkChildren();
					sink('_', false);
				break;
				case "ul":
					ulDepth++;
					startBlock(2);
					sinkChildren();
					endBlock();
					ulDepth--;
				break;
				case "ol":
					olDepth++;
					startBlock(2);
					sinkChildren();
					endBlock();
					olDepth--;
				break;
				case "li":
					startBlock();

					//sink('\t', true);
					/*
					foreach(cnt; 0 .. olDepth + ulDepth) {
						sink(' ', true);
						sink(' ', true);
					}
					*/
					// one bullet for each kind of list we are currently inside
					if(olDepth)
						sink('*', false);
					if(ulDepth)
						sink('*', false);
					sink(' ', true);

					sinkChildren();

					endBlock();
				break;

				case "dl":
				case "dt":
				case "dd":
					// only the definition body is indented
					startBlock(element.tagName == "dd" ? 2 : 0);
					sinkChildren();
					endBlock();
				break;

				case "h1":
					startBlock();
					sink('#', true);
					sink('#', true);
					sink(' ', true);
					sinkChildren();
					sink(' ', true);
					sink('#', true);
					sink('#', true);
					endBlock();
				break;
				case "h2", "h3":
					startBlock();
					sinkChildren();
					sink('\n', true);
					// underline with one marker per character of the heading text
					foreach(dchar ch; element.innerText)
						sink(element.tagName == "h2" ? '=' : '-', false);
					endBlock();
				break;
				case "hr":
					// a rule half the width of the page, offset by a quarter of the width
					startBlock();
					foreach(i; 0 .. width / 4)
						sink(' ', true);
					foreach(i; 0 .. width / 2)
						sink('-', false);
					endBlock();
				break;

				case "br":
					sink('\n', true);
				break;
				case "div":
					startBlock();

					/*
					auto csc = element.computedStyle.getValue("background-color");
					if(csc.length) {
						auto c = Color.fromString(csc);
						s ~= format("\033[48;2;%d;%d;%dm", c.r, c.g, c.b);
					}
					*/

					sinkChildren();

					/*
					if(csc.length)
						s ~= "\033[49m";
					*/

					endBlock();
				break;
				case "pre":
					startBlock(4);
					// children are rendered with whitespace preserved
					foreach(child; element.childNodes)
						htmlToText(child, true, width);
					endBlock();
				break;
				default:
					sinkChildren();
			}
		}
	}

	/// Number of `<ol>` elements currently open around the node being rendered.
	int olDepth;
	/// Number of `<ul>` elements currently open around the node being rendered.
	int ulDepth;

	/++
		Parses `html` and renders it to text, starting at the first `<body>` element
		if there is one and at the document root otherwise.

		Params:
			html = markup to convert; it is wrapped in a synthetic `<roottag>` before parsing
			wantWordWrap = forwarded to the other overload, which currently does not consult it
			wrapAmount = wrap column

		Returns: the accumulated output buffer [s].
	+/
	string convert(string html, bool wantWordWrap = true, int wrapAmount = 74) {
		Document document = new Document;

		document.parse("<roottag>" ~ html ~ "</roottag>");

		Element start;
		auto bod = document.getElementsByTagName("body");
		if(bod.length)
			start = bod[0];
		else
			start = document.root;

		//import std.file;
		//auto stylesheet = new StyleSheet(readText("/var/www/dpldocs.info/experimental-docs/style.css"));
		//stylesheet.apply(document);

		return convert(start, wantWordWrap, wrapAmount);
	}

	/++
		Renders the tree rooted at `start` and returns the accumulated output.

		Params:
			start = root of the subtree to render
			wantWordWrap = currently not consulted; wrapping always happens at `wrapAmount`
			wrapAmount = wrap column

		Returns: the accumulated output buffer [s].
	+/
	string convert(Element start, bool wantWordWrap = true, int wrapAmount = 74) {
		htmlToText(start, false, wrapAmount);
		return s;
	}

	/++
		Clears the output buffer and all layout bookkeeping so the converter can be
		reused. User-facing settings ([enableVtOutput], [color], [backgroundColor])
		are left alone.
	+/
	void reset() {
		s = null;
		justOutputWhitespace = true;
		justOutputBlock = true;
		justOutputMargin = true;

		// Line and indentation state must go too, otherwise the next document
		// starts mid-line or inherits indentation from the previous one.
		lineLength = 0;
		needsIndent = false;
		indentStack = null;
		olDepth = 0;
		ulDepth = 0;
	}

	/// The output accumulated so far.
	string s;
	/// True while the most recent output was whitespace; used by [sink] to collapse runs.
	bool justOutputWhitespace = true;
	/// True while no non-whitespace has been written since the last block newline; checked by [startBlock].
	bool justOutputBlock = true;
	/// True while no non-whitespace has been written since the last margin newline; checked by [startBlock] and [endBlock].
	bool justOutputMargin = true;
	/// Number of characters counted on the current output line, including indentation.
	int lineLength;

	/++
		Appends a single character to [s], emitting any pending indentation first,
		collapsing whitespace runs unless `preformatted`, and wrapping once the
		current line has reached the wrap column.

		Params:
			item = character to append
			preformatted = when true, whitespace is kept verbatim
			lineWidthOverride = wrap column for this character only; `int.min` means use [width]
	+/
	void sink(dchar item, bool preformatted, int lineWidthOverride = int.min) {
		import std.range.primitives : walkLength;

		if(needsIndent && item != '\n') {
			lineLength += doIndent();
			needsIndent = false;
		}

		int width = lineWidthOverride == int.min ? this.width : lineWidthOverride;
		if(!preformatted && isWhite(item)) {
			if(!justOutputWhitespace) {
				// any whitespace character becomes a single space
				item = ' ';
				justOutputWhitespace = true;
			} else {
				return;
			}
		} else {
			// if it is preformatted, we still need to keep track of if it is whitespace
			// so stuff like <br> is somewhat sane
			justOutputWhitespace = preformatted && isWhite(item);
		}

		s ~= item;

		if(lineLength >= width) {
			// rewind to the nearest space, if there is one, to break on a word boundary
			int c =  lineLength;
			bool broken;
			foreach_reverse(idx, char ch; s) {
				if(ch == '\n')
					break;
				if(ch == ' ') {
					auto os = s;
					auto tail = os[idx + 1 .. $];
					s = os[0 .. idx];
					s ~= '\n';
					// Count code points, not UTF-8 bytes: every other update of
					// lineLength advances by one per dchar.
					lineLength = cast(int) tail.walkLength;
					lineLength += doIndent();
					s ~= tail;
					broken = true;
					break;
				}
				c--;
				// do not search back into the first few columns of the line
				if(c < 5)
					break;
			}

			if(!broken) {
				// no usable space found: hard break right here
				s ~= '\n';
				lineLength = 0;
				needsIndent = true;
				justOutputWhitespace = true;
			}

		}


		if(item == '\n') {
			lineLength = 0;
			needsIndent = true;
		} else
			lineLength ++;


		if(!justOutputWhitespace) {
			justOutputBlock = false;
			justOutputMargin = false;
		}
	}

	/++
		Appends one space to [s] for every unit of indentation on [indentStack].

		Returns: the number of spaces written.
	+/
	int doIndent() {
		int cnt = 0;
		foreach(i; indentStack)
			foreach(lol; 0 .. i) {
				s ~= ' ';
				cnt++;
			}
		return cnt;
	}

	/// Indentation contributed by each currently open block, outermost first.
	int[] indentStack;
	/// Set after a newline so the next non-newline character is preceded by the indentation.
	bool needsIndent = false;

	/++
		Opens a block: pushes `indent` onto [indentStack] and, unless they were
		already just written, emits the newline that ends the current line and
		the blank margin line before the block.
	+/
	void startBlock(int indent = 0) {

		indentStack ~= indent;

		if(!justOutputBlock) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputBlock = true;
		}
		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputMargin = true;
		}
	}

	/++
		Closes the innermost block: pops its indentation and emits a trailing
		newline unless one was just written.
	+/
	void endBlock() {
		if(indentStack.length)
			indentStack = indentStack[0 .. $ - 1];

		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputMargin = true;
		}
	}
}
467 
/++
	One-shot convenience wrapper: creates a fresh [HtmlConverter] and returns
	the result of its `convert` method for the given markup.

	Params:
		html = markup to convert
		wantWordWrap = passed through to `HtmlConverter.convert`
		wrapAmount = passed through to `HtmlConverter.convert` as the wrap column
+/
string htmlToText(string html, bool wantWordWrap = true, int wrapAmount = 74) {
	return (new HtmlConverter()).convert(html, wantWordWrap, wrapAmount);
}
473