From: martin f. krafft <madduck@madduck.net>
Date: Sat, 10 Aug 2019 01:21:39 +0000 (+0200)
Subject: initial checkin
X-Git-Url: https://git.madduck.net/code/twitter-archiver.git/commitdiff_plain/849c48fb1ae4b3d94248a24a419b7ce519c52715?pf=code

initial checkin
---

849c48fb1ae4b3d94248a24a419b7ce519c52715
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..194a4fc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+authdata.py
+/__pycache__
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..12188d6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,30 @@
+# Archiving Twitter
+
+This is a set of scripts allowing you to keep a copy of your Twitter life:
+
+1. Dumping followers, optionally sending changes to you by e-mail at regular
+   intervals (using cron).
+
+2. Archiving all your tweets as they come in: their raw data, as well as
+   an HTML dump the way Twitter would render it at the time.
+
+All of this is very hackish, but if it's of any use to you, then great!
+
+## Patches
+
+Patches welcome, please send them to madduck@madduck.net using
+git-format-patch and git-send-email.
+
+## Wishlist
+
+0. Setup instructions
+1. Dumping favourites/starred tweets
+2. Dumping bookmarks
+3. Exporting parameters (such as email address) to configuration
+
+## Copyright
+
+Copyright © 2017–2019 by martin f. krafft <madduck@madduck.net>
+and released under the terms of the Artistic Licence 2.0.
+
+If you need it under another licence, let me know.
diff --git a/authdata.py.sample b/authdata.py.sample
new file mode 100644
index 0000000..d2aa133
--- /dev/null
+++ b/authdata.py.sample
@@ -0,0 +1,4 @@
+consumer_key = '…'
+consumer_secret = '…'
+access_token = '…'
+access_secret = '…'
diff --git a/fetch_tweets.sh b/fetch_tweets.sh
new file mode 100755
index 0000000..0ce8ff2
--- /dev/null
+++ b/fetch_tweets.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+# fetch_tweets.sh
+#
+# Convenience wrapper for tweetfetch.py
+#
+# Using ./tweets/.sentinel, it iteratively obtains new tweets since the last
+# run, and can thus be run regularly from cron without arguments.
+#
+# Copyright © 2017–2019 by martin f. krafft <madduck@madduck.net>
+# Released under the Artistic Licence 2.0
+#
+
+MYDIR="${0%/*}"
+cd "$MYDIR"
+
+export LC_ALL=$(locale -a | grep utf8 | head -1)
+
+./tweetfetch.py $(cat tweets/.sentinel)
diff --git a/followerdump.py b/followerdump.py
new file mode 100755
index 0000000..20ba366
--- /dev/null
+++ b/followerdump.py
@@ -0,0 +1,45 @@
+#!/usr/bin/python3
+#
+# followerdump.py
+#
+# Dumps the list of your Twitter followers to stdout in the format
+#
+#   username <tab> displayname <tab> UID
+#
+# Copyright © 2017–2019 by martin f. krafft <madduck@madduck.net>
+# Released under the Artistic Licence 2.0
+#
+
+from authdata import *
+
+import twython
+import itertools
+import time
+import sys
+
+twitter = twython.Twython(app_key=consumer_key,
+        app_secret=consumer_secret,
+        oauth_token=access_token,
+        oauth_token_secret=access_secret)
+
+def grouper(iterable, n, fillvalue=None):
+    "Collect data into fixed-length chunks or blocks"
+    args = [iter(iterable)] * n
+    return itertools.zip_longest(fillvalue=fillvalue, *args)
+
+i=0
+followers = twitter.cursor(twitter.get_followers_ids,
+        count=5000, stringify_ids=True)
+
+#print('Obtained followers…', file=sys.stderr)
+
+for chunk in grouper(followers, 100):
+    chunk = [c for c in chunk if c]
+    #print('  fetching user data for chunk of {0:d} users…'.format(len(chunk)),
+    #    file=sys.stderr)
+    n = 0
+    for follower in twitter.lookup_user(user_id=','.join(chunk)):
+        #print('    [{0:02d}] @{1:s}'.format(n, follower['screen_name']),
+        #        file=sys.stderr)
+        n += 1
+        print('\t'.join([follower[i] for i in ('screen_name','name','id_str')]))
diff --git a/followers b/followers
new file mode 160000
index 0000000..69cf3e8
--- /dev/null
+++ b/followers
@@ -0,0 +1 @@
+Subproject commit 69cf3e8a9cd84b5894c6c9528ba14b588e3daff9
diff --git a/report_followers.sh b/report_followers.sh
new file mode 100755
index 0000000..0ef7e04
--- /dev/null
+++ b/report_followers.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+#
+# report_followers.sh
+#
+# Send a diff of the follower dumps stored under ./followers by e-mail,
+# designed to be run daily from cron.
+#
+# The optional argument overrides the timestamp to compare against, which
+# defaults to "yesterday"
+#
+# Copyright © 2017–2019 by martin f. krafft <madduck@madduck.net>
+# Released under the Artistic Licence 2.0
+#
+
+set -eu
+
+MYDIR="${0%/*}"
+cd "$MYDIR"/followers
+
+export LC_ALL=$(locale -a | grep utf8 | head -1)
+
+REVSPEC="${1:-master@{yesterday\}}"
+DATESTAMP="$(git show --pretty=format:'%ai (%ar)' --no-patch $REVSPEC)"
+COUNT=$(wc -l dump | cut -d' ' -f1)
+STATS="$(git diff --numstat $REVSPEC | \
+  sed -re 's,([[:digit:]]+)[[:space:]]+([[:digit:]]+).*,+\1/-\2,')"
+
+SENDMAIL="/usr/sbin/sendmail madduck@madduck.net"
+[ ! -t 0 ] || SENDMAIL=cat
+
+$SENDMAIL <<_eof
+From: Twitter follower report <madduck@madduck.net>
+Subject: $(wc -l dump | cut -d' ' -f1) followers ($STATS)
+
+Changes since $DATESTAMP:
+
+$(git diff $REVSPEC | grep '^[-+][^-+]' | sort -k1.1,1.1)
+
+=Total followers: $COUNT
+_eof
diff --git a/track_followers.sh b/track_followers.sh
new file mode 100755
index 0000000..f207903
--- /dev/null
+++ b/track_followers.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+#
+# track_followers.sh
+#
+# Convenience wrapper for followerdump.py
+#
+# Obtains the current list of followers and commits them to Git
+#
+# Copyright © 2017–2019 by martin f. krafft <madduck@madduck.net>
+# Released under the Artistic Licence 2.0
+#
+
+MYDIR="${0%/*}"
+cd "$MYDIR"
+
+export LC_ALL=$(locale -a | grep utf8 | head -1)
+
+./followerdump.py | sort > followers/dump
+cd followers
+[ -s dump ] || exit 0
+COUNT=$(wc -l dump | cut -d' ' -f1)
+MSG="Twitter follower set update, total=$COUNT"
+git add dump
+if git commit -m"$MSG" >&2; then
+  git show HEAD
+fi
diff --git a/tweetfetch.py b/tweetfetch.py
new file mode 100755
index 0000000..7c7e67b
--- /dev/null
+++ b/tweetfetch.py
@@ -0,0 +1,61 @@
+#!/usr/bin/python3
+#
+# tweetfetch.py
+#
+# Fetches tweets newer than a given tweet ID and stores data in JSON files, as
+# well as an HTML dump for each in the subdirectory ./tweets.
+#
+# Usage: ./tweetfetch.py 1159606554041040896
+#
+# The ID of the newest tweet fetched is stored in ./tweets/.sentinel
+#
+# Copyright © 2017–2019 by martin f. krafft <madduck@madduck.net>
+# Released under the Artistic Licence 2.0
+#
+
+from authdata import *
+
+from twython import Twython
+import json
+import sys
+
+twitter = Twython(app_key=consumer_key,
+                  app_secret=consumer_secret,
+                  oauth_token=access_token,
+                  oauth_token_secret=access_secret,
+                  oauth_version=1)
+
+config = {'include_rts': False,
+          'count': 200,
+          'trim_user': True,
+          'exclude_replies': True,
+         }
+
+if len(sys.argv) > 1:
+    config['since_id'] = sys.argv[1]
+    print("Limiting results to tweets since ID {}".format(config['since_id']),
+            file=sys.stderr)
+
+user_timeline = twitter.get_user_timeline(screen_name="martinkrafft",
+        **config)
+
+max_id = int(config.get('since_id', 0))
+
+print("Fetched {} tweets, writing them to disk…".format(len(user_timeline)),
+        file=sys.stderr)
+
+for tweet in user_timeline:
+    with open("tweets/{}.json".format(tweet['id_str']), "wt") as tf:
+        print(json.dumps(tweet), file=tf)
+
+    with open("tweets/{}.html".format(tweet['id_str']), "wt") as tf:
+        print(Twython.html_for_tweet(tweet, use_expanded_url=True), file=tf)
+
+    print("  wrote tweet ID {}".format(tweet['id_str']),
+            file=sys.stderr)
+
+    max_id = max(tweet['id'], max_id)
+
+print("Writing ID {} to sentinel file…".format(max_id), file=sys.stderr)
+with open("tweets/.sentinel", "wt") as tf:
+    print('{0:d}'.format(max_id), file=tf)
diff --git a/tweets/.gitignore b/tweets/.gitignore
new file mode 100644
index 0000000..a68d087
--- /dev/null
+++ b/tweets/.gitignore
@@ -0,0 +1,2 @@
+/*
+!/.gitignore