# Fetch a web page over HTTP or HTTPS and return the text of its <title> tag.
#
# Parameters:
#   hostname - server to contact; may carry an explicit port as "host:port".
#   uri      - request target placed on the GET line.
#   https    - passed through to connect(); also selects the default port
#              (443 when true, 80 otherwise).
#
# Result (value of the last expression): the text captured from the first
# <title>...< match in the response body. If no title is seen before the
# response ends, the title variable is returned as declared (never assigned).
subroutine(get_title_by_http_or_https(string hostname, string uri, bool https), (

  # Keep the host exactly as the caller gave it: the Host header must carry
  # the port when a non-default one is in use.
  string hostHeader = hostname;

  # Work out which port to connect to; an explicit "host:port" wins.
  int port = if (https) then 443 else 80;
  if (matches_regular_expression(hostname, '^([^:]+):([0-9]+)$')) then (
    hostname = $1;
    port = $2;
  );

  # Connect to the HTTP server
  string socket = connect(hostname, port, https);

  # Send a GET request for the specified URI. "Connection: close" asks the
  # server to end the stream after the response, so the read loop below sees
  # end-of-data instead of waiting on an idle keep-alive connection.
  data d = "GET " . uri . " HTTP/1.1\r\nHost: " . hostHeader . "\r\nConnection: close\r\n\r\n";
  write_to_socket(socket, d, length(d));

  # State for the read loop
  string headerBuffer;       # everything received until the blank line is found
  string headers;            # header section (without the terminating blank line)
  string body;               # decoded body text accumulated so far
  string pending;            # body bytes made available by the current read
  string chunkedBody;        # undecoded remainder of a chunked body
  string chunkSizeField;     # hex size field of the chunk being examined
  string chunk;
  string title;
  int blankLinePos;
  int chunkLength;
  bool done = false;
  bool inHeaders = true;
  bool chunked = false;
  bool extractedAllChunksSoFar;

  while (!done) (

    # A zero-length read ends the loop after this pass; the pass still runs
    # so anything already buffered gets examined once more.
    read_from_socket(socket, d, 10000);
    if (length(d) == 0) then
      done = true;

    # If we haven't found the end of the headers yet, look again
    if (inHeaders) then (
      headerBuffer .= d;
      pending = "";
      blankLinePos = index(headerBuffer, '\r\n\r\n');
      if (blankLinePos != -1) then (
        headers = substr(headerBuffer, 0, blankLinePos);

        # Whatever followed the blank line in this read is already body data;
        # it is handled below in this same pass rather than on the next read.
        pending = substr(headerBuffer, blankLinePos + 4);

        # Header names are case-insensitive, so compare in lowercase.
        if (matches_regular_expression(lowercase(headers), '\r\ntransfer-encoding: *chunked')) then
          chunked = true;

        inHeaders = false;
      );
    );
    else (
      pending = d;
    );

    # Body handling (also runs on the pass in which the headers completed)
    if (!inHeaders) then (

      if (chunked) then (

        # Add this block to the undecoded data, then decode every complete
        # chunk it now contains.
        chunkedBody .= pending;
        extractedAllChunksSoFar = false;
        while (!extractedAllChunksSoFar) (

          if (matches_regular_expression(chunkedBody, '^([0-9A-Fa-f]+)\r\n')) then (
            chunkSizeField = $1;
            chunkLength = convert_base(chunkSizeField, 16, 10);

            # A zero-length chunk terminates the body. Test the converted
            # value, not the hex text.
            if (chunkLength == 0) then (
              done = true;
              extractedAllChunksSoFar = true;
            );

            # A chunk is complete only when the size line (field + CRLF), the
            # data, and the CRLF after the data have all arrived. Otherwise
            # read more.
            else if (length(chunkedBody) < length(chunkSizeField) + 2 + chunkLength + 2) then (
              extractedAllChunksSoFar = true;
            );

            # Full chunk available: move its data to the body and drop the
            # whole chunk (including its trailing CRLF) from the buffer, so
            # the buffer again starts at a size line.
            else (
              chunk = substr(chunkedBody, length(chunkSizeField) + 2, chunkLength);
              chunkedBody = substr(chunkedBody, length(chunkSizeField) + 2 + chunkLength + 2);
              body .= chunk;
            );
          );

          # Empty buffer, or a size line that has not fully arrived yet:
          # stop decoding and read more (without this the loop cannot end).
          else (
            extractedAllChunksSoFar = true;
          );

        ); # while more chunks can be extracted
      ); # if chunked

      # if not chunked, the bytes are body text as-is
      else (
        body .= pending;
      );

      # Check what we have so far, to see if there's a <title> tag
      if (matches_regular_expression(body, '<[Tt][Ii][Tt][Ll][Ee]>([^>]+)<')) then (
        done = true;
        title = $1;
      );

    ); # if reading body

  ); # while not done

  disconnect(socket);

  # The subroutine's value is the title found (if any).
  title;

)); # subroutine get_title_by_http_or_https
# Plain-HTTP convenience wrapper: looks up the page title for hostname/uri by
# delegating to get_title_by_http_or_https with its https argument false.
# The value of this subroutine is whatever that call yields.
subroutine(get_title_by_http(string hostname, string uri), (
  bool use_ssl = false;
  get_title_by_http_or_https(hostname, uri, use_ssl);
));
# HTTPS convenience wrapper: looks up the page title for hostname/uri by
# delegating to get_title_by_http_or_https with its https argument true.
# The value of this subroutine is whatever that call yields.
subroutine(get_title_by_https(string hostname, string uri), (
  bool use_ssl = true;
  get_title_by_http_or_https(hostname, uri, use_ssl);
));