TITLE(1) General Commands Manual TITLE(1)
title [-v] [-x pattern] [url]
title fetches HTML page titles over HTTP and HTTPS. title scans standard
input for URLs and writes their titles to standard output. If a url
argument is given, title exits after fetching its title.
The arguments are as follows:
-x pattern
    Exclude URLs matching pattern, which is a modern regular expression.
    See re_format(7).
-v
    Enable libcurl(3) verbose output.
mkfifo snarf titles
relay irc.example.org 6697 snarf '#example' <>titles >snarf
title <snarf >titles
title.c in git
/* Copyright (C) 2019 June McEnroe <june@causal.agency>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <curl/curl.h>
#include <err.h>
#include <locale.h>
#include <regex.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <wchar.h>
// Compile pattern as a POSIX extended regular expression.
//
// flags is OR'd with REG_EXTENDED and handed to regcomp(3). On success the
// compiled regex_t is returned by value. On failure the regerror(3) message
// and the offending pattern are reported through errx(3), which exits with
// status 1, so callers never see a failed compilation.
static regex_t regex(const char *pattern, int flags) {
	regex_t compiled;
	int error = regcomp(&compiled, pattern, REG_EXTENDED | flags);
	if (!error) return compiled;

	char buf[256];
	regerror(error, &compiled, buf, sizeof(buf));
	errx(1, "regcomp: %s: %s", buf, pattern);
}
// Named character references understood by entity(). Each name is the full
// "&name;" spelling, including the leading '&' and trailing ';', exactly as
// it appears in the page source.
static const struct Entity {
	wchar_t ch;
	const char *name;
} Entities[] = {
	{ L'"', "&quot;" },
	{ L'&', "&amp;" },
	{ L'<', "&lt;" },
	{ L'>', "&gt;" },
	{ L'\xA0', "&nbsp;" },
};

// Decode the character reference that name begins with.
//
// name points at the '&' of a reference; text following the reference is
// ignored. Named references are looked up in Entities by prefix. Numeric
// references are parsed as hexadecimal ("&#x...") or decimal ("&#...").
//
// Returns the decoded character, or 0 when the reference is not recognized,
// decodes to NUL, or names a value that is not a Unicode scalar value
// (above U+10FFFF or in the surrogate range), so that such input is never
// truncated into an unrelated character.
static wchar_t entity(const char *name) {
	for (size_t i = 0; i < sizeof(Entities) / sizeof(Entities[0]); ++i) {
		const struct Entity *known = &Entities[i];
		if (strncmp(name, known->name, strlen(known->name))) continue;
		return known->ch;
	}

	unsigned long code;
	if (!strncmp(name, "&#x", 3)) {
		code = strtoul(&name[3], NULL, 16);
	} else if (!strncmp(name, "&#", 2)) {
		code = strtoul(&name[2], NULL, 10);
	} else {
		return 0;
	}

	// Reject anything that cannot be a single valid character.
	if (code > 0x10FFFF) return 0;
	if (code >= 0xD800 && code <= 0xDFFF) return 0;
	return (wchar_t)code;
}
// Matches either a run of whitespace or a single character reference: a
// named one (&name;), a decimal one (&#123;) or a hexadecimal one (&#x1f;).
// showTitle() uses the first character of the match to tell the two apart.
static const char EntityPattern[] = {
"[[:space:]]+|&([[:alpha:]]+|#([[:digit:]]+|x[[:xdigit:]]+));"
};
// Compiled form of EntityPattern; assigned in main() before any use.
static regex_t EntityRegex;
// Write title to standard output as one line.
//
// Every run of whitespace is replaced by a single space and every character
// reference that entity() can decode is replaced by the decoded character.
// References entity() does not recognize are copied through unchanged.
static void showTitle(const char *title) {
	const char *rest = title;
	while (*rest) {
		regmatch_t found;
		if (regexec(&EntityRegex, rest, 1, &found, 0) != 0) break;

		const int before = (int)found.rm_so;
		const char *token = rest + found.rm_so;
		if (*token != '&') {
			// Whitespace run: the text before it, then one space.
			printf("%.*s ", before, rest);
		} else {
			wchar_t decoded = entity(token);
			if (decoded != 0) {
				printf("%.*s%lc", before, rest, (wint_t)decoded);
			} else {
				// Unknown reference: copy everything up to and
				// including its ';' verbatim.
				printf("%.*s", (int)found.rm_eo, rest);
			}
		}
		rest += found.rm_eo;
	}
	// Whatever follows the last match (possibly nothing) finishes the line.
	printf("%s\n", rest);
}
// libcurl easy handle, created in main() and reused for every request made
// by fetchTitle().
static CURL *curl;
// True once handleBody() has printed a title for the current transfer;
// fetchTitle() clears it before each GET.
static bool title;
// Leading portion of the response body received so far. handleBody() keeps
// buf NUL-terminated, so at most sizeof(buf) - 1 bytes are stored; len is
// the number of bytes currently held.
static struct {
char buf[64 * 1024];
size_t len;
} body;
// HE COMES
// The title is located with a regular expression rather than an HTML parser:
// an opening <title> tag without attributes, text containing no '<', and the
// closing tag. Capture group 1 is the title text.
static const char TitlePattern[] = "<title>([^<]*)</title>";
// Compiled form of TitlePattern; main() compiles it with REG_ICASE.
static regex_t TitleRegex;
// Write callback installed with CURLOPT_WRITEFUNCTION.
//
// Appends as much of the incoming chunk as still fits into body.buf, then
// searches the accumulated text for a <title> element. The first title found
// is printed with showTitle() and the `title` flag is set, after which later
// chunks are ignored. The full chunk length is always returned, so the
// transfer is never aborted from here, even once the buffer is full.
static size_t handleBody(char *buf, size_t size, size_t nitems, void *user) {
	(void)user;
	const size_t len = size * nitems;

	// Space left in body.buf, keeping one byte for the terminator.
	const size_t room = sizeof(body.buf) - body.len - 1;
	const size_t take = (len < room ? len : room);
	if (title || take == 0) return len;

	memcpy(body.buf + body.len, buf, take);
	body.len += take;
	body.buf[body.len] = '\0';

	regmatch_t groups[2];
	if (regexec(&TitleRegex, body.buf, 2, groups, 0) == 0) {
		// Cut the buffer at the end of the captured text so that
		// showTitle() sees only the title.
		char *text = body.buf + groups[1].rm_so;
		body.buf[groups[1].rm_eo] = '\0';
		showTitle(text);
		title = true;
	}
	return len;
}
// Fetch url and, if it is an HTML page, let handleBody() print its title.
//
// Two requests are made on the shared `curl` handle:
//   1. a header-only request (CURLOPT_NOBODY) to learn the Content-Type;
//      anything that is not text/html is skipped and CURLE_OK is returned;
//   2. a GET of the effective URL reported for the first request, whose
//      body is fed to handleBody().
//
// Returns CURLE_OK on success or when the resource is skipped, otherwise the
// first libcurl error encountered. Exits via err(3) if memory runs out.
static CURLcode fetchTitle(const char *url) {
	CURLcode code = curl_easy_setopt(curl, CURLOPT_URL, url);
	if (code) return code;

	curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
	code = curl_easy_perform(curl);
	if (code) return code;

	char *type = NULL;
	code = curl_easy_getinfo(curl, CURLINFO_CONTENT_TYPE, &type);
	if (code) return code;
	if (!type || strncmp(type, "text/html", 9)) return CURLE_OK;

	// The effective URL string belongs to the handle, so take a private copy
	// before handing it back to CURLOPT_URL (presumably setting the URL may
	// invalidate it -- confirm against the libcurl docs). Fall back to the
	// requested URL if libcurl reports none, rather than passing NULL to
	// strdup().
	char *effective = NULL;
	curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_URL, &effective);
	char *dest = strdup(effective ? effective : url);
	if (!dest) err(1, "strdup");

	// Release the copy on both the success and the failure path.
	code = curl_easy_setopt(curl, CURLOPT_URL, dest);
	free(dest);
	if (code) return code;

	// Start the body transfer with an empty buffer and no title seen yet.
	body.len = 0;
	title = false;
	curl_easy_setopt(curl, CURLOPT_HTTPGET, 1L);
	return curl_easy_perform(curl);
}
// Entry point.
//
// Options:
//   -x pattern  skip URLs matching the extended regular expression pattern
//   -v          enable libcurl verbose output
//
// With a url argument, fetches that one title and exits: status 0 on
// success, 1 with a diagnostic on failure. Without one, reads lines from
// standard input, fetches the title of every URL found in them, and warns
// (without exiting) about each failed fetch.
int main(int argc, char *argv[]) {
	EntityRegex = regex(EntityPattern, 0);
	TitleRegex = regex(TitlePattern, REG_ICASE);

	// showTitle() prints decoded characters with %lc, which encodes them
	// according to the LC_CTYPE locale.
	setlocale(LC_CTYPE, "");
	// Flush each title as soon as its line is complete.
	setlinebuf(stdout);

	CURLcode code = curl_global_init(CURL_GLOBAL_ALL);
	if (code) errx(1, "curl_global_init: %s", curl_easy_strerror(code));

	curl = curl_easy_init();
	if (!curl) errx(1, "curl_easy_init");

	// Static so the buffer stays valid for as long as the handle uses it.
	static char error[CURL_ERROR_SIZE];
	curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, error);

	curl_easy_setopt(curl, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
	curl_easy_setopt(
		curl, CURLOPT_USERAGENT,
		"curl/7.54.0 facebookexternalhit/1.1 Twitterbot/1.0"
	);
	curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
	curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
	curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 3L);
	curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, handleBody);

	bool exclude = false;
	regex_t excludeRegex;

	int opt;
	while (0 < (opt = getopt(argc, argv, "x:v"))) {
		switch (opt) {
		case 'x':
			exclude = true;
			excludeRegex = regex(optarg, REG_NOSUB);
			break;
		case 'v':
			curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
			break;
		default:
			return 1;
		}
	}

	// One-shot mode: a URL on the command line bypasses the exclude
	// pattern and standard input entirely.
	if (optind < argc) {
		code = fetchTitle(argv[optind]);
		if (!code) return 0;
		errx(1, "curl_easy_perform: %s", error);
	}

	char *buf = NULL;
	size_t cap = 0;
	regex_t urlRegex = regex("https?://([^[:space:]>\"()]|[(][^)]*[)])+", 0);
	while (0 < getline(&buf, &cap, stdin)) {
		regmatch_t match = {0};
		for (char *ptr = buf; *ptr; ptr += match.rm_eo) {
			if (regexec(&urlRegex, ptr, 1, &match, 0)) break;

			// Terminate the URL in place for the duration of the fetch,
			// then put back exactly the byte that was there. When the URL
			// runs to the end of the line that byte is the string's own
			// NUL (e.g. a final line without a newline); writing anything
			// else would leave the buffer unterminated and send the next
			// regexec() past its end.
			char saved = ptr[match.rm_eo];
			ptr[match.rm_eo] = '\0';
			const char *url = &ptr[match.rm_so];
			if (!exclude || regexec(&excludeRegex, url, 0, NULL, 0)) {
				code = fetchTitle(url);
				if (code) warnx("curl_easy_perform: %s", error);
			}
			ptr[match.rm_eo] = saved;
		}
	}
	if (ferror(stdin)) err(1, "getline");
	return 0;
}