Search notes:

download-content-of-linked-pages.pl

download-content-of-linked-pages.pl is a simple (that is, rudimentary) Perl script that uses HTML::LinkExtractor and LWP::Simple to fetch the pages that are linked to from the document at a given URL.
It is executed from the command line with
$ download-content-of-linked-pages.pl https://some-host.xyz/path/to/some/file.html
#!/usr/bin/perl
use warnings;
use strict;

use HTML::LinkExtractor;
use LWP::Simple qw(get);
use File::Path qw(make_path);
use URI;

# Fetch the document named on the command line and pass every link found in
# it to link_found(), which downloads the link targets.
my $url = $ARGV[0] or die "No url specified";

# Name the URL in the error so a failed start-up fetch can be diagnosed.
my $html = get($url) or die "Could not fetch $url\n";

# The second argument is handed to the extractor together with the callback;
# per the HTML::LinkExtractor documentation it is the base URL against which
# links are made absolute.
my $extractor = HTML::LinkExtractor->new(\&link_found, $url);

$extractor->parse(\$html);

# Callback for HTML::LinkExtractor, called once per extracted link.
#
# Arguments: only the second one ($_[1]) is used; it is a hash reference
# describing the link, of which the 'tag' and 'href' entries are read.
#
# For every <a> link whose URL has a host component, the link target is
# downloaded and written to
#     downloads/<host><path>[?<query>]
# with 'index' appended when that name would otherwise end in a slash.
# Targets whose local file already exists are skipped.
#
# Returns nothing. Dies if the local file cannot be written; a link that
# cannot be fetched only produces a warning.
sub link_found {
  my $link = $_[1];

  return unless $link->{tag} eq 'a';

  my $href = $link->{href};
  return unless defined $href;

  my $uri = URI->new($href);

  # URI classes for schemes such as mailto: or javascript: have no host()
  # method; calling it would abort the whole run, so skip those links.
  return unless $uri->can('host');
  my $host = $uri->host;
  return unless defined $host && length $host;

  # Build the local file name from the URI components. $href itself stays
  # untouched so that the URL requested below is exactly the one linked to.
  my $path = $uri->path;
  $path = '/' unless length $path;   # http://host  ->  downloads/host/index

  my $full_path = "downloads/$host$path";
  my $query     = $uri->query;
  $full_path .= "?$query" if defined $query && length $query;

  # A name ending in a slash denotes a directory; store its content as 'index'.
  $full_path .= 'index' if substr($full_path, -1) eq '/';

  print "href = $href\nhost = $host\nfull_path= $full_path\n";

  # The URL comes from a remote page: never let '..' segments direct the
  # write outside of the downloads/ tree.
  if (grep { $_ eq '..' } split m{/}, $full_path) {
    warn "Skipping $href (path contains '..')\n";
    return;
  }

  return if -e $full_path;

  # Fetch before creating anything on disk so that a failed download does
  # not leave an empty file behind that later runs would treat as done.
  my $content = get($href);
  unless (defined $content) {
    warn "Could not fetch $href\n";
    return;
  }

  # $full_path always starts with 'downloads/' and never ends in a slash,
  # so this match cannot fail.
  my ($dirname) = $full_path =~ m{\A(.*)/[^/]+\z}s;

  print "$href  -> $host -> $full_path -> $dirname\n";
  make_path($dirname) unless -d $dirname;

  open my $out, '>', $full_path or die "$full_path\n$!";
  print {$out} $content         or die "$full_path\n$!";
  close $out                    or die "$full_path\n$!";   # buffered write errors surface here

  printf "Downloaded %-100s to %s\n", $href, $full_path;
}
Github repository PerlModules, path: /HTML/LinkExtractor/download-content-of-linked-pages.pl

Index

Fatal error: Uncaught PDOException: SQLSTATE[HY000]: General error: 8 attempt to write a readonly database in /home/httpd/vhosts/renenyffenegger.ch/php/web-request-database.php:78 Stack trace: #0 /home/httpd/vhosts/renenyffenegger.ch/php/web-request-database.php(78): PDOStatement->execute(Array) #1 /home/httpd/vhosts/renenyffenegger.ch/php/web-request-database.php(30): insert_webrequest_('/notes/developm...', 1759398087, '216.73.216.42', 'Mozilla/5.0 App...', NULL) #2 /home/httpd/vhosts/renenyffenegger.ch/httpsdocs/notes/development/languages/Perl/modules/HTML/LinkExtractor/download-content-of-linked-pages_pl(89): insert_webrequest() #3 {main} thrown in /home/httpd/vhosts/renenyffenegger.ch/php/web-request-database.php on line 78