DEHTML(1)
General Commands Manual
DEHTML(1)
dehtml
—
extract text from HTML
The dehtml
utility extracts text from HTML
documents. Text inside
<title>,
<style>
and
<script>
tags is discarded. Numeric and common named HTML entities are converted.
The arguments are as follows:
-s
- Collapse whitespace outside of
<pre>
tags.
There is no way to extract image alt text.
dehtml.l in git
/* Copyright (C) 2021 June McEnroe <june@causal.agency>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
%option noinput nounput noyywrap
%{
/*
 * Token kinds returned by the scanner rules. Numbering starts at 1 so
 * that no kind is 0, the value that ends the scanning loop in main().
 */
enum Token {
	Doctype = 1, /* <!DOCTYPE ...> declaration */
	Comment = 2, /* <!-- ... --> */
	TagOpen = 3, /* any tag that is not a closing tag */
	TagClose = 4, /* </...> */
	Entity = 5, /* &...; reference */
	Text = 6, /* run of non-space text */
	Space = 7, /* run of whitespace */
};
%}
%%
"<!DOCTYPE "[^>]*">" { /* document type declaration, through the first closing angle bracket */ return Doctype; }
"<!--"([^-]|-[^-]|--[^>])*"-->" { /* comment, through its terminating dash-dash-bracket sequence */ return Comment; }
"</"[^>]*">" { /* closing tag; placed ahead of the generic tag rule so it wins equal-length matches */ return TagClose; }
"<"[^>]*">" { /* any other tag */ return TagOpen; }
"&"[^;]*";" { /* entity reference: ampersand through the next semicolon */ return Entity; }
[^<&[:space:]]+ { /* run of ordinary text with no markup or whitespace */ return Text; }
[[:space:]]+ { /* run of whitespace */ return Space; }
%%
#include <err.h>
#include <locale.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <wchar.h>
/*
 * Named character references understood by entity(). Each name is spelled
 * exactly as it appears in the input, including the leading '&' and the
 * trailing ';', because it is compared against the whole scanner token.
 * The non-breaking space is emitted as an ordinary space.
 */
static const struct {
	wchar_t ch;
	const char *name;
} Entities[] = {
	{ L'&', "&amp;" },
	{ L'<', "&lt;" },
	{ L'>', "&gt;" },
	{ L'"', "&quot;" },
	{ L' ', "&nbsp;" },
	{ L'\u00A9', "&copy;" },
	{ L'\u00B7', "&middot;" },
	{ L'\u00BB', "&raquo;" },
	{ L'\u200F', "&rlm;" },
	{ L'\u2014', "&mdash;" },
	{ L'\u2191', "&uarr;" },
};
/*
 * Print the character denoted by the entity token currently in yytext
 * (the whole reference, from '&' through ';').
 *
 * Numeric references may be decimal (&#NNN;) or hexadecimal with either
 * case of the prefix (&#xHHH; or &#XHHH;). The digits must run all the
 * way to the ';' and the value must not exceed U+10FFFF. Named references
 * are looked up in Entities.
 *
 * A reference that cannot be resolved, or that resolves to zero, is
 * reported with warnx() and copied to the output unchanged.
 */
static void entity(void) {
	wchar_t ch = 0;
	if (yytext[1] == '#') {
		bool hex = (yytext[2] == 'x' || yytext[2] == 'X');
		const char *digits = &yytext[hex ? 3 : 2];
		char *end;
		unsigned long code = strtoul(digits, &end, hex ? 16 : 10);
		/*
		 * Reject empty or partially numeric references. The range
		 * check also catches negative input and overflow, both of
		 * which strtoul() reports as very large values.
		 */
		if (end != digits && *end == ';' && code <= 0x10FFFF) {
			ch = (wchar_t)code;
		}
	} else {
		for (size_t i = 0; i < sizeof(Entities) / sizeof(Entities[0]); ++i) {
			if (strcmp(Entities[i].name, yytext)) continue;
			ch = Entities[i].ch;
			break;
		}
	}
	if (ch) {
		printf("%lc", (wint_t)ch);
	} else {
		warnx("unknown entity %s", yytext);
		printf("%s", yytext);
	}
}
/*
 * Report whether the tag token currently in yytext names the element
 * `tag`, compared without regard to case. Works for both opening and
 * closing tags: a '/' directly after the '<' is skipped.
 *
 * The element name must end right after the match, so "pre" does not
 * match <present>. The name may be followed by the closing '>', by the
 * '/' of a self-closing form, or by any whitespace character that can
 * precede attributes (space, tab, newline, form feed, carriage return).
 */
static bool isTag(const char *tag) {
	const char *ptr = &yytext[1];
	if (*ptr == '/') ptr++;
	size_t len = strlen(tag);
	if (strncasecmp(ptr, tag, len)) return false;
	ptr += len;
	switch (*ptr) {
	case '>':
	case '/':
	case ' ':
	case '\t':
	case '\n':
	case '\f':
	case '\r':
		return true;
	default:
		return false;
	}
}
/*
 * dehtml [-s] [file ...]
 *
 * Scans each named file, or standard input when no file is given, and
 * writes its text content to standard output:
 *   - everything between <title>, <style> or <script> tags is dropped;
 *   - entity references are converted by entity();
 *   - with -s, each run of whitespace outside <pre> is reduced to its
 *     first character, and whitespace that follows other whitespace
 *     (or starts the file) is dropped.
 *
 * Exits 1 on an unknown option or if an input file cannot be opened.
 */
int main(int argc, char *argv[]) {
	setlocale(LC_CTYPE, "");

	bool collapse = false;
	for (int opt; 0 < (opt = getopt(argc, argv, "s"));) {
		switch (opt) {
		case 's':
			collapse = true;
			break;
		default:
			return 1;
		}
	}
	argc -= optind;
	argv += optind;
	/*
	 * With no operands, still run the loop once: argv[0] is then the
	 * NULL that terminates the argument vector, which selects stdin.
	 */
	if (!argc) argc++;

	for (int i = 0; i < argc; ++i) {
		yyin = (argv[i] ? fopen(argv[i], "r") : stdin);
		if (!yyin) err(1, "%s", argv[i]);

		bool space = true; /* last thing written was whitespace (or nothing yet) */
		bool discard = false; /* inside <title>, <style> or <script> */
		bool pre = false; /* inside <pre> */
		for (enum Token tok; (tok = yylex());) {
			if (tok == TagOpen || tok == TagClose) {
				/* Tags are tracked even while discarding, so the
				 * closing tag can end the discarded region. */
				if (isTag("title") || isTag("style") || isTag("script")) {
					discard = (tok == TagOpen);
				} else if (isTag("pre")) {
					pre = (tok == TagOpen);
				}
			} else if (discard) {
				continue;
			} else if (tok == Entity) {
				entity();
				space = false;
			} else if (tok == Text) {
				printf("%s", yytext);
				space = false;
			} else if (tok == Space) {
				if (collapse && !pre) {
					if (space) continue;
					printf("%c", yytext[0]);
				} else {
					printf("%s", yytext);
				}
				space = true;
			}
		}

		/* Close named inputs so that a long operand list does not
		 * run the process out of file descriptors. */
		if (yyin != stdin) fclose(yyin);
	}
	return 0;
}