X-Git-Url: https://git.madduck.net/code/myrepos.git/blobdiff_plain/af9a9d12caffaccb04e0e989a5b71e18370953f4..d8426f37c88c5d980aa93855e40b50c97c282590:/webcheckout?ds=inline

diff --git a/webcheckout b/webcheckout
index 33bccb9..a6ae94f 100755
--- a/webcheckout
+++ b/webcheckout
@@ -1,14 +1,76 @@
 #!/usr/bin/perl
+
+=head1 NAME
+
+webcheckout - check out repositories referenced on a web page
+
+=head1 SYNOPSIS
+
+B<webcheckout> [options] url [destdir]
+
+=head1 DESCRIPTION
+
+B<webcheckout> downloads an url and parses it, looking for version control 
+repositories referenced by the page. It checks out each repository into
+a subdirectory of the current directory, using whatever VCS program is
+appropriate for that repository (git, svn, etc).
+
+The information about the repositories is embedded in the web page using
+the rel=vcs microformat, which is documented at
+<http://kitenet.net/~joey/rfc/rel-vcs/>.
+
+If the optional destdir parameter is specified, VCS programs will be asked
+to check out repositories into that directory. If there are multiple
+repositories to check out, each will be checked out into a separate
+subdirectory of the destdir.
+
+=head1 OPTIONS
+
+=over 4
+
+=item -a, --auth
+
+Prefer authenticated repositories. By default, webcheckout will use
+anonymous repositories when possible. If you have an account that
+allows you to use authenticated repositories, you might want to use this
+option.
+
+=item -n
+
+Do not actually check anything out, just print out the commands that would
+be run to check out the repositories.
+
+=item -q
+
+Quiet mode. Do not print out the commands being run. (The VCS commands
+may still be noisy however.)
+
+=back
+
+=head1 AUTHOR
+
+Copyright 2009 Joey Hess <joey@kitenet.net>
+
+Licensed under the GNU GPL version 2 or higher.
+
+This program is included in mr <http://kitenet.net/~joey/code/mr/>
+
+=cut
+
 use LWP::Simple;
 use HTML::Parser;
+use Getopt::Long;
 use warnings;
 use strict;
 
+# What to download.
+my $url;
+
 # Controls whether to print what is being done.
-my $verbose=1;
+my $quiet=0;
 
 # Controls whether to actually check anything out.
-my $noact=1;
+my $noact=0;
 
 # Controls whether to perfer repos that use authentication.
 my $want_auth=0;
@@ -35,9 +97,28 @@ my @anon_urls=(
 	qr/^http:\/\//i, # generally the worst transport
 );
 
+sub getopts {
+	Getopt::Long::Configure("bundling", "no_permute");
+	my $result=GetOptions(
+		"q|quiet" => \$quiet,
+		"n|noact" => \$noact,
+		"a|auth", => \$want_auth,
+	);
+	if (! $result || @ARGV < 1) {
+		die "usage: webcheckout [options] url [destdir]\n";
+	}
+
+	$url=shift @ARGV;
+	$destdir=shift @ARGV;
+
+	if ($noact) {
+		$quiet=0;
+	}
+}
+
 sub doit {
 	my @args=grep { defined } @_;
-	print join(" ", @args)."\n" if $verbose;
+	print join(" ", @args)."\n" unless $quiet;
 	return 0 if $noact;
 	return system(@args);
 }
@@ -46,23 +127,26 @@ sub doit {
 sub better {
 	my ($a, $b)=@_;
 
-	my $firstanon=$b;
+	my @anon;
 	foreach my $r (@anon_urls) {
 		if ($a->{href} =~ /$r/) {
-			$firstanon=$a;
-			last;
+			push @anon, $a;
 		}
 		elsif ($b->{href} =~ /$r/) {
-			$firstanon=$b;
-			last;
+			push @anon, $b;
 		}
 	}
 
 	if ($want_auth) {
-		return $firstanon != $a;
+		# Whichever is authed is better.
+		return 1 if ! @anon || ! grep { $_ eq $a } @anon;
+		return 0 if ! grep { $_ eq $b } @anon;
+		# Neither is authed, so the better anon method wins.
+		return $anon[0] == $a;
 	}
 	else {
-		return $firstanon == $a;
+		# Better anon method wins.
+		return @anon && $anon[0] == $a;
 	}
 }
 
@@ -71,21 +155,26 @@ sub better {
 sub dedup {
 	my %seenhref;
 	my %bytitle;
+	my @others;
 	foreach my $repo (@_) {
 		if (exists $repo->{title} &&
-		    length $repo->{title} &&
-		    exists $bytitle{$repo->{title}}) {
-			my $other=$bytitle{$repo->{title}};
-			next unless better($repo, $other);
-			delete $bytitle{$other->{title}}
-		}
+		    length $repo->{title}) {
+		 	if (exists $bytitle{$repo->{title}}) {
+				my $other=$bytitle{$repo->{title}};
+				next unless better($repo, $other);
+				delete $bytitle{$other->{title}}
+			}
 
-		if (! $seenhref{$repo->{href}}++) {
-			$bytitle{$repo->{title}}=$repo;
+			if (! $seenhref{$repo->{href}}++) {
+				$bytitle{$repo->{title}}=$repo;
+			}
+		}
+		else {
+			push @others, $repo;
 		}
 	}
 
-	return values %bytitle;
+	return values %bytitle, @others;
 }
 
 sub parse {
@@ -93,25 +182,44 @@ sub parse {
 
 	my @ret;
 	my $parser=HTML::Parser->new(api_version => 3);
+	my $abody=undef;
+	my $aref=undef;
 	$parser->handler(start => sub {
 		my $tagname=shift;
 		my $attr=shift;
-		return if lc $tagname ne 'link';
 		return if ! exists $attr->{rel} || lc $attr->{rel} ne 'vcs';
 		return if ! exists $attr->{href} || ! length $attr->{href};
 		return if ! exists $attr->{type} || ! length $attr->{type};
+
+		# need to collect the body of the <a> tag if there is no title
+		if ($tagname eq "a" && ! exists $attr->{title}) {
+			$abody="";
+			$aref=$attr;
+		}
+
 		push @ret, $attr;
 	}, "tagname, attr");
+	$parser->handler(text => sub {
+		if (defined $aref) {
+			$abody.=join(" ", @_);
+		}
+	}, "text");
+	$parser->handler(end => sub {
+		my $tagname=shift;
+		if ($tagname eq "a" && defined $aref) {
+			$aref->{title}=$abody;
+			$aref=undef;
+			$abody=undef;
+		}
+	}, "tagname");
+	$parser->report_tags(qw{link a});
 	$parser->parse($page);
 	$parser->eof;
 
 	return @ret;
 }
 
-my $url=shift;
-if (! defined $url) {
-	die "usage: webcheckout url\n";
-}
+getopts();
 
 my $page=get($url);
 if (! defined $page) {
@@ -123,6 +231,19 @@ if (! @repos) {
 	die "no repositories found on $url\n";
 }
 
+#use Data::Dumper;
+#print Dumper(\@repos);
+#exit;
+
+if (defined $destdir && @repos > 1) {
+	# create subdirs of $destdir for the multiple repos
+	if (! $noact) {
+		mkdir($destdir);
+		chdir($destdir) || die "failed to chdir to $destdir: $!";
+	}
+	$destdir=undef;
+}
+
 my $errors=0;
 foreach my $repo (@repos) {
 	my $handler=$handlers{$repo->{type}};
@@ -139,6 +260,3 @@ foreach my $repo (@repos) {
 	}
 }
 exit($errors > 0);
-
-#use Data::Dumper;
-#print Dumper(\@repos);