webcheckout

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 webcheckout - check out repositories referenced on a web page
   6
   7 =head1 SYNOPSIS
   8
   9 B<webcheckout> [options] url [destdir]
  10
  11 =head1 DESCRIPTION
  12
  13 B<webcheckout> downloads an url and parses it, looking for version control
  14 repositories referenced by the page. It checks out each repository into
  15 a subdirectory of the current directory, using whatever VCS program is
  16 appropriate for that repository (git, svn, etc).
  17
  18 The information about the repositories is embedded in the web page using
  19 the rel=vcs-* microformat, which is documented at
  20 <http://kitenet.net/~joey/rfc/rel-vcs/>.
  21
  22 If the optional destdir parameter is specified, VCS programs will be asked
  23 to check out repositories into that directory. If there are multiple
  24 repositories to check out, each will be checked out into a separate
  25 subdirectory of the destdir.
  26
  27 =head1 OPTIONS
  28
  29 =over 4
  30
  31 =item -a, --auth
  32
  33 Prefer authenticated repositories. By default, webcheckout will use
  34 anonymous repositories when possible. If you have an account that
  35 allows you to use authenticated repositories, you might want to use this
  36 option.
  37
  38 =item --no-act, -n
  39
  40 Do not actually check anything out, just print out the commands that would
  41 be run to check out the repositories.
  42
  43 =item --quiet, -q
  44
  45 Quiet mode. Do not print out the commands being run. (The VCS commands
  46 may still be noisy however.)
  47
  48 =back
  49
  50 =head1 PREREQUISITES
  51
  52 To use this program you will need lots of VCS programs installed,
  53 obviously. It also depends on the perl LWP and HTML::Parser modules.
  54
  55 If the perl URI module is installed, webcheckout can heuristically guess
  56 what you mean by partial URLs, such as "kitenet.net/~joey"'
  57
  58 =head1 AUTHOR
  59
  60 Copyright 2009 Joey Hess <joey@kitenet.net>
  61
  62 Licensed under the GNU GPL version 2 or higher.
  63
  64 This program is included in myrepos <http://myrepos.branchable.com/>
  65
  66 =cut
  67
  68 use LWP::Simple;
  69 use HTML::Parser;
  70 use Getopt::Long;
  71 use warnings;
  72 use strict;
  73
  74 # What to download.
  75 my $url;
  76
  77 # Controls whether to print what is being done.
  78 my $quiet=0;
  79
  80 # Controls whether to actually check anything out.
  81 my $noact=0;
  82
  83 # Controls whether to perfer repos that use authentication.
  84 my $want_auth=0;
  85
  86 # Controls where to check out to. If not set, the VCS is allowed to
  87 # decide.
  88 my $destdir;
  89
  90 # how to perform checkouts
  91 my %handlers=(
  92         git => sub { doit("git", "clone", shift, $destdir) },
  93         svn => sub { doit("svn", "checkout", shift, $destdir) },
  94         bzr => sub { doit("bzr", "branch", shift, $destdir) },
  95 );
  96
  97 # Regexps matching urls that are used for anonymous
  98 # repository checkouts. The order is significant:
  99 # urls matching earlier in the list are preferred over
 100 # those matching later.
 101 my @anon_urls=(
 102         qr/^git:\/\//i,
 103         qr/^bzr:\/\//i,
 104         qr/^svn:\/\//i,
 105         qr/^http:\/\//i, # generally the worst transport
 106 );
 107
 108 sub getopts {
 109         Getopt::Long::Configure("bundling", "no_permute");
 110         my $result=GetOptions(
 111                 "q|quiet" => \$quiet,
 112                 "n|noact" => \$noact,
 113                 "a|auth", => \$want_auth,
 114         );
 115         if (! $result || @ARGV < 1) {
 116                 die "usage: webcheckout [options] url [destdir]\n";
 117         }
 118
 119         $url=shift @ARGV;
 120         $destdir=shift @ARGV;
 121
 122         eval q{use URI::Heuristic};
 123         if (! $@) {
 124                 $url=URI::Heuristic::uf_uristr($url);
 125         }
 126
 127         if ($noact) {
 128                 $quiet=0;
 129         }
 130 }
 131
 132 sub doit {
 133         my @args=grep { defined } @_;
 134         print join(" ", @args)."\n" unless $quiet;
 135         return 0 if $noact;
 136         return system(@args);
 137 }
 138
 139 # Is repo a better than repo b?
 140 sub better {
 141         my ($a, $b)=@_;
 142
 143         my @anon;
 144         foreach my $r (@anon_urls) {
 145                 if ($a->{href} =~ /$r/) {
 146                         push @anon, $a;
 147                 }
 148                 elsif ($b->{href} =~ /$r/) {
 149                         push @anon, $b;
 150                 }
 151         }
 152
 153         if ($want_auth) {
 154                 # Whichever is authed is better.
 155                 return 1 if ! @anon || ! grep { $_ eq $a } @anon;
 156                 return 0 if ! grep { $_ eq $b } @anon;
 157                 # Neither is authed, so the better anon method wins.
 158                 return $anon[0] == $a;
 159         }
 160         else {
 161                 # Better anon method wins.
 162                 return @anon && $anon[0] == $a;
 163         }
 164 }
 165
 166 # Eliminate duplicate repositories from list.
 167 # Duplicate repositories have the same title, or the same href.
 168 sub dedup {
 169         my %seenhref;
 170         my %bytitle;
 171         my @others;
 172         foreach my $repo (@_) {
 173                 if (exists $repo->{title} &&
 174                     length $repo->{title}) {
 175                         if (exists $bytitle{$repo->{title}}) {
 176                                 my $other=$bytitle{$repo->{title}};
 177                                 next unless better($repo, $other);
 178                                 delete $bytitle{$other->{title}}
 179                         }
 180
 181                         if (! $seenhref{$repo->{href}}++) {
 182                                 $bytitle{$repo->{title}}=$repo;
 183                         }
 184                 }
 185                 else {
 186                         push @others, $repo;
 187                 }
 188         }
 189
 190         return values %bytitle, @others;
 191 }
 192
 193 sub parse {
 194         my $page=shift;
 195
 196         my @ret;
 197         my $parser=HTML::Parser->new(api_version => 3);
 198         my $abody=undef;
 199         my $aref=undef;
 200         $parser->handler(start => sub {
 201                 my $tagname=shift;
 202                 my $attr=shift;
 203
 204                 return if ! exists $attr->{href} || ! length $attr->{href};
 205                 return if ! exists $attr->{rel} || $attr->{rel} !~ /^vcs-(.+)/i;
 206                 $attr->{type}=lc($1);
 207
 208                 # need to collect the body of the <a> tag if there is no title
 209                 if ($tagname eq "a" && ! exists $attr->{title}) {
 210                         $abody="";
 211                         $aref=$attr;
 212                 }
 213
 214                 push @ret, $attr;
 215         }, "tagname, attr");
 216         $parser->handler(text => sub {
 217                 if (defined $aref) {
 218                         $abody.=join(" ", @_);
 219                 }
 220         }, "text");
 221         $parser->handler(end => sub {
 222                 my $tagname=shift;
 223                 if ($tagname eq "a" && defined $aref) {
 224                         $aref->{title}=$abody;
 225                         $aref=undef;
 226                         $abody=undef;
 227                 }
 228         }, "tagname");
 229         $parser->report_tags(qw{link a});
 230         $parser->parse($page);
 231         $parser->eof;
 232
 233         return @ret;
 234 }
 235
 236 getopts();
 237
 238 my $page=get($url);
 239 if (! defined $page) {
 240         die "failed to download $url\n";
 241 }
 242
 243 my @repos=dedup(parse($page));
 244 if (! @repos) {
 245         die "no repositories found on $url\n";
 246 }
 247
 248 #use Data::Dumper;
 249 #print Dumper(\@repos);
 250 #exit;
 251
 252 if (defined $destdir && @repos > 1) {
 253         # create subdirs of $destdir for the multiple repos
 254         if (! $noact) {
 255                 mkdir($destdir);
 256                 chdir($destdir) || die "failed to chdir to $destdir: $!";
 257         }
 258         $destdir=undef;
 259 }
 260
 261 my $errors=0;
 262 foreach my $repo (@repos) {
 263         my $handler=$handlers{$repo->{type}};
 264         if ($handler) {
 265                 if ($handler->($repo->{href}) != 0) {
 266                         print STDERR "failed to checkout ".$repo->{href}."\n";
 267                         $errors++;
 268                 }
 269         }
 270         else {
 271                 print STDERR "unknown repository type ".$repo->{type}.
 272                         " for ".$repo->{href}."\n";
 273                 $errors++;
 274         }
 275 }
 276 exit($errors > 0);