From: martin f. krafft Date: Sat, 10 Aug 2019 01:21:39 +0000 (+0200) Subject: initial checkin X-Git-Url: https://git.madduck.net/code/twitter-archiver.git/commitdiff_plain/849c48fb1ae4b3d94248a24a419b7ce519c52715?pf=code initial checkin --- 849c48fb1ae4b3d94248a24a419b7ce519c52715 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..194a4fc --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +authdata.py +/__pycache__ diff --git a/README.md b/README.md new file mode 100644 index 0000000..12188d6 --- /dev/null +++ b/README.md @@ -0,0 +1,30 @@ +# Archiving Twitter + +These are a set of scripts allowing you to keep a copy of your Twitter life: + +1. Dumping followers, optionally sending changes to you by e-mail at regular + intervals (using cron). + +2. Archiving all your tweets as they come in, both their raw data, as well as + an HTML dump the way Twitter would dump it at the time. + +All of this is very hackish, but if it's of any use to you, then great! + +## Patches + +Patches welcome, please send them to madduck@madduck.net using +git-format-patch and git-send-email. + +## Wishlist + +0. Setup instructions +1. Dumping favourites/starred tweets +2. Dumping bookmarks +3. Exporting parameters (such as email address) to configuration + +## Copyright + +Copyright © 2017–2019 by martin f. krafft +and released under the terms of the Artistic Licence 2.0. + +If you need it under another licence, let me know. 
diff --git a/authdata.py.sample b/authdata.py.sample new file mode 100644 index 0000000..d2aa133 --- /dev/null +++ b/authdata.py.sample @@ -0,0 +1,4 @@ +consumer_key = '…' +consumer_secret = '…' +access_token = '…' +access_secret = '…' diff --git a/fetch_tweets.sh b/fetch_tweets.sh new file mode 100755 index 0000000..0ce8ff2 --- /dev/null +++ b/fetch_tweets.sh @@ -0,0 +1,19 @@ +#!/bin/sh +# +# fetch_tweets.sh +# +# Convenience wrapper for tweetfetch.py +# +# Using ./tweets/.sentinel, it iteratively obtains new tweets since the last +# run, and can thus be run regularly from cron without arguments. +# +# Copyright © 2017–2019 by martin f. krafft +# Released under the Artistic Licence 2.0 +# + +MYDIR="${0%/*}" +cd "$MYDIR" + +export LC_ALL=$(locale -a | grep utf8 | head -1) + +./tweetfetch.py $(cat tweets/.sentinel) diff --git a/followerdump.py b/followerdump.py new file mode 100755 index 0000000..20ba366 --- /dev/null +++ b/followerdump.py @@ -0,0 +1,45 @@ +#!/usr/bin/python3 +# +# followerdump.py +# +# Dumps the list of your Twitter followers to stdout in the format +# +# username displayname UID +# +# Copyright © 2017–2019 by martin f. 
krafft +# Released under the Artistic Licence 2.0 +# + +from authdata import * + +import twython +import itertools +import time +import sys + +twitter = twython.Twython(app_key=consumer_key, + app_secret=consumer_secret, + oauth_token=access_token, + oauth_token_secret=access_secret) + +def grouper(iterable, n, fillvalue=None): + "Collect data into fixed-length chunks or blocks" + args = [iter(iterable)] * n + return itertools.zip_longest(fillvalue=fillvalue, *args) + +i=0 +followers = twitter.cursor(twitter.get_followers_ids, + count=5000, stringify_ids=True) + +#print('Obtained followers…', file=sys.stderr) + +for chunk in grouper(followers, 100): + chunk = [c for c in chunk if c] + #print(' fetching user data for chunk of {0:d} users…'.format(len(chunk)), + # file=sys.stderr) + n = 0 + for follower in twitter.lookup_user(user_id=','.join(chunk)): + #print(' [{0:02d}] @{1:s}'.format(n, follower['screen_name']), + # file=sys.stderr) + n += 1 + print('\t'.join([follower[i] for i in ('screen_name','name','id_str')])) diff --git a/followers b/followers new file mode 160000 index 0000000..69cf3e8 --- /dev/null +++ b/followers @@ -0,0 +1 @@ +Subproject commit 69cf3e8a9cd84b5894c6c9528ba14b588e3daff9 diff --git a/report_followers.sh b/report_followers.sh new file mode 100755 index 0000000..0ef7e04 --- /dev/null +++ b/report_followers.sh @@ -0,0 +1,40 @@ +#!/bin/sh +# +# report_followers.sh +# +# Send a diff of the follower dumps stored under ./followers by e-mail, +# designed to be run daily from cron. +# +# The optional argument overrides the Git revision to compare against, which +# defaults to "master@{yesterday}" +# +# Copyright © 2017–2019 by martin f. 
krafft +# Released under the Artistic Licence 2.0 +# + +set -eu + +MYDIR="${0%/*}" +cd "$MYDIR"/followers + +export LC_ALL=$(locale -a | grep utf8 | head -1) + +REVSPEC="${1:-master@{yesterday\}}" +DATESTAMP="$(git show --pretty=format:'%ai (%ar)' --no-patch $REVSPEC)" +COUNT=$(wc -l dump | cut -d' ' -f1) +STATS="$(git diff --numstat $REVSPEC | \ + sed -re 's,([[:digit:]]+)[[:space:]]+([[:digit:]]+).*,+\1/-\2,')" + +SENDMAIL="/usr/sbin/sendmail madduck@madduck.net" +[ ! -t 0 ] || SENDMAIL=cat + +$SENDMAIL <<_eof +From: Twitter follower report +Subject: $(wc -l dump | cut -d' ' -f1) followers ($STATS) + +Changes since $DATESTAMP: + +$(git diff $REVSPEC | grep '^[-+][^-+]' | sort -k1.1,1.1) + +=Total followers: $COUNT +_eof diff --git a/track_followers.sh b/track_followers.sh new file mode 100755 index 0000000..f207903 --- /dev/null +++ b/track_followers.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# +# track_followers.sh +# +# Convenience wrapper for followerdump.py +# +# Obtains the current list of followers and commits them to Git +# +# Copyright © 2017–2019 by martin f. krafft +# Released under the Artistic Licence 2.0 +# + +MYDIR="${0%/*}" +cd "$MYDIR" + +export LC_ALL=$(locale -a | grep utf8 | head -1) + +./followerdump.py | sort > followers/dump +cd followers +[ -s dump ] || exit 0 +COUNT=$(wc -l dump | cut -d' ' -f1) +MSG="Twitter follower set update, total=$COUNT" +git add dump +if git commit -m"$MSG" >&2; then + git show HEAD +fi diff --git a/tweetfetch.py b/tweetfetch.py new file mode 100755 index 0000000..7c7e67b --- /dev/null +++ b/tweetfetch.py @@ -0,0 +1,61 @@ +#!/usr/bin/python3 +# +# tweetfetch.py +# +# Fetches tweets since a given tweet ID and stores data in JSON files, as +# well as an HTML dump for each in the subdirectory ./tweets. +# +# Usage: ./tweetfetch.py 1159606554041040896 +# +# The highest tweet ID fetched is stored in ./tweets/.sentinel +# +# Copyright © 2017–2019 by martin f. 
krafft +# Released under the Artistic Licence 2.0 +# + +from authdata import * + +from twython import Twython +import json +import sys + +twitter = Twython(app_key=consumer_key, + app_secret=consumer_secret, + oauth_token=access_token, + oauth_token_secret=access_secret, + oauth_version=1) + +config = {'include_rts': False, + 'count': 200, + 'trim_user': True, + 'exclude_replies': True, + } + +if len(sys.argv) > 1: + config['since_id'] = sys.argv[1] + print("Limiting results to tweets since ID {}".format(config['since_id']), + file=sys.stderr) + +user_timeline = twitter.get_user_timeline(screen_name="martinkrafft", + **config) + +max_id = int(config.get('since_id', 0)) + +print("Fetched {} tweets, writing them to disk…".format(len(user_timeline)), + file=sys.stderr) + +for tweet in user_timeline: + with open("tweets/{}.json".format(tweet['id_str']), "wt") as tf: + print(json.dumps(tweet), file=tf) + + with open("tweets/{}.html".format(tweet['id_str']), "wt") as tf: + print(Twython.html_for_tweet(tweet, use_expanded_url=True), file=tf) + + print(" wrote tweet ID {}".format(tweet['id_str']), + file=sys.stderr) + + max_id = max(tweet['id'], max_id) + +print("Writing ID {} to sentinel file…".format(max_id), file=sys.stderr) +with open("tweets/.sentinel", "wt") as tf: + print('{0:d}'.format(max_id), file=tf) diff --git a/tweets/.gitignore b/tweets/.gitignore new file mode 100644 index 0000000..a68d087 --- /dev/null +++ b/tweets/.gitignore @@ -0,0 +1,2 @@ +/* +!/.gitignore