# Gets the title of a web page by making an HTTP (or HTTPS) connection and
# scanning the response body for a <title> tag.
#
# Parameters:
#   hostname - server to contact; may include an explicit port as "host:port"
#   uri      - the URI placed in the GET request line
#   https    - selects the default port (443 when true, 80 when false) and is
#              passed through to connect()
#
# The value of the subroutine is the text captured between <title> and the
# next "<". If no title is seen before the response ends, the title variable
# is never assigned (presumably leaving it empty -- confirm).
#
# NOTE(review): the request is HTTP/1.1 without "Connection: close". If the
# server keeps the connection open and the page has neither a title nor a
# "</html>" inside a single read, the loop depends on read_from_socket()
# eventually returning no data -- confirm its timeout behavior.
subroutine(get_title_by_http_or_https(string hostname, string uri, bool https), (

  # Default port for the scheme; an explicit "host:port" overrides it
  int port = if (https) then 443 else 80;
  if (matches_regular_expression(hostname, '^([^:]+):([0-9]+)$')) then (
    hostname = $1;
    port = $2;
  );

  # Connect to the server
  string socket = connect(hostname, port, https);

  # Send a GET request for the specified URI
  data d = "GET " . uri . " HTTP/1.1\r\nHost: " . hostname . "\r\n\r\n";
  write_to_socket(socket, d, length(d));

  # State for the read loop
  string totalResponse;    # everything read so far, while still looking for the end of the headers
  string headers;          # header section, without the terminating blank line
  string body;             # decoded body accumulated so far
  string incoming;         # body bytes that became available in the current iteration
  bool done = false;
  bool inHeaders = true;
  bool chunked = false;
  string chunkedBody;      # undecoded remainder of a chunked body
  string chunkSizeField;   # hex size field of the chunk being examined
  int chunkLength;         # decoded size of the chunk being examined
  int chunkFrameLength;    # size field + CRLF + chunk data + CRLF
  string title;

  while (!done) (

    read_from_socket(socket, d, 10000);

    # An empty read ends the loop after this pass
    if (length(d) == 0) then
      done = true;

    # So does the end of the HTML document
    if (contains(d, "</html>")) then
      done = true;

    # Unless we are still in the headers, everything just read is body data
    incoming = d;

    # If we haven't found the end of the headers yet, look again
    if (inHeaders) then (

      totalResponse .= d;
      incoming = "";

      int blankLinePos = index(totalResponse, '\r\n\r\n');
      if (blankLinePos != -1) then (

        headers = substr(totalResponse, 0, blankLinePos);

        # Whatever followed the blank line in this read is the start of the body.
        # It is processed below in this same iteration, so a response that
        # arrives in a single read is still decoded and searched for a title.
        incoming = substr(totalResponse, blankLinePos + 4);

        # HTTP header names are case-insensitive, so accept either capitalization
        if (matches_regular_expression(headers, '\r\n[Tt]ransfer-[Ee]ncoding: *chunked')) then
          chunked = true;

        inHeaders = false;

      );

    ); # if inHeaders

    # Process body data once the headers are complete
    if (!inHeaders) then (

      if (chunked) then (

        # Add the new data to the undecoded buffer
        chunkedBody .= incoming;

        # Extract all the complete chunks we can from what we have
        bool extractedAllChunksSoFar = false;
        while (!extractedAllChunksSoFar) (

          if (matches_regular_expression(chunkedBody, '^([0-9A-Fa-f]+)\r\n')) then (

            # Save the size field right away; $1 is overwritten by later matches
            chunkSizeField = $1;
            chunkLength = convert_base(chunkSizeField, 16, 10);
            chunkFrameLength = length(chunkSizeField) + 2 + chunkLength + 2;

            # A zero-length chunk marks the end of the body
            if (chunkLength == 0) then (
              done = true;
              extractedAllChunksSoFar = true;
            );

            # Wait for the whole chunk, including its trailing CRLF, so the
            # buffer always starts at a size line after extraction
            else if (length(chunkedBody) < chunkFrameLength) then (
              extractedAllChunksSoFar = true;
            );

            # We have a full chunk; move its data to the body
            else (
              body .= substr(chunkedBody, length(chunkSizeField) + 2, chunkLength);
              chunkedBody = substr(chunkedBody, chunkFrameLength);
            );

          );

          # No complete size line yet (empty, whitespace only, or a size line
          # split across reads): stop and read more instead of spinning here
          else (
            extractedAllChunksSoFar = true;
          );

        ); # while !extractedAllChunksSoFar

      ); # if chunked

      # if not chunked, the data is the body as-is
      else (
        body .= incoming;
      );

      # Check what we have so far, to see if there's a <title> tag
      if (matches_regular_expression(body, '<[Tt][Ii][Tt][Ll][Ee]>([^<]+)<')) then (
        done = true;
        title = $1;
      );

    ); # if past headers

  ); # while not done

  disconnect(socket);

  title;

)); # subroutine get_title_by_http_or_https

# Gets the title of a page over plain HTTP (https = false).
subroutine(get_title_by_http(string hostname, string uri), (
  get_title_by_http_or_https(hostname, uri, false);
));

# Gets the title of a page over HTTPS (https = true).
subroutine(get_title_by_https(string hostname, string uri), (
  get_title_by_http_or_https(hostname, uri, true);
));