From a015a321a3e52511881358ba71abb96b1205e983 Mon Sep 17 00:00:00 2001
From: Alex Huddleston
Date: Tue, 12 Feb 2019 22:17:13 -0600
Subject: [PATCH] proof of concept for tweet downloading.

---
 .gitignore          |  5 ++-
 tweet_downloader.py | 81 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+), 1 deletion(-)
 create mode 100644 tweet_downloader.py

diff --git a/.gitignore b/.gitignore
index d7b9f96..528815a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,4 @@
-.pyenv/*
\ No newline at end of file
+.pyenv/*
+keys*
+users*
+*_tweets*
\ No newline at end of file
diff --git a/tweet_downloader.py b/tweet_downloader.py
new file mode 100644
index 0000000..adb749c
--- /dev/null
+++ b/tweet_downloader.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+# encoding: utf-8
+
+# Some legacy code I wrote for a project in college. It's ugly and messy. I'll clean it up and repurpose it later.
+
+import tweepy #https://github.com/tweepy/tweepy
+import csv
+from time import sleep
+
+
+def get_all_tweets(screen_name):
+    #Twitter only allows access to a user's most recent 3240 tweets with this method
+    with open('keys.txt', 'r') as f:
+        #Twitter API credentials
+        consumer_key = f.readline().rstrip()
+        consumer_secret = f.readline().rstrip()
+        access_key = f.readline().rstrip()
+        access_secret = f.readline().rstrip()
+
+    #authorize twitter, initialize tweepy
+    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
+    auth.set_access_token(access_key, access_secret)
+    api = tweepy.API(auth)
+
+    #initialize a list to hold all the tweepy Tweets
+    alltweets = []
+
+    #make initial request for most recent tweets (200 is the maximum allowed count)
+    new_tweets = api.user_timeline(screen_name = screen_name,count=200,tweet_mode = 'extended')
+
+    #save most recent tweets
+    alltweets.extend(new_tweets)
+
+    #save the id of the oldest tweet less one
+    oldest = alltweets[-1].id - 1
+
+    #keep grabbing tweets until there are no tweets left to grab
+    while len(new_tweets) > 0 and len(alltweets) < 3000:
+        print("getting tweets before %s" % (oldest))
+
+        #all subsequent requests use the max_id param to prevent duplicates
+        new_tweets = api.user_timeline(screen_name = screen_name,count=200,max_id=oldest,tweet_mode = 'extended')
+
+        #save most recent tweets
+        alltweets.extend(new_tweets)
+
+        #update the id of the oldest tweet less one
+        oldest = alltweets[-1].id - 1
+
+        print("...%s tweets downloaded so far" % (len(alltweets)))
+
+    #transform the tweepy tweets into a 2D array that will populate the csv
+    outtweets = []
+    for tweet in alltweets:
+        try:
+            if tweet.retweeted or ('RT @' in tweet.text):
+                outtweets.append([tweet.id_str, tweet.created_at, "True", tweet.retweeted_status.text.encode("utf-8"), tweet.entities.get('hashtags'), tweet.entities.get('user_mentions')])
+        except:
+            try:
+                outtweets.append([tweet.id_str, tweet.created_at, ('RT @' in tweet.full_text), tweet.full_text.encode("utf-8"), tweet.entities.get('hashtags'), tweet.entities.get('user_mentions')])
+            except:
+                outtweets.append([tweet.id_str, tweet.created_at, ('RT @' in tweet.text), tweet.text.encode("utf-8"), tweet.entities.get('hashtags'), tweet.entities.get('user_mentions')])
+
+    #write the csv
+    with open('%s_tweets.csv' % screen_name, 'w') as f:
+        writer = csv.writer(f)
+        writer.writerow(["id","created_at","retweet","text","hashtags","user_mentions"])
+        writer.writerows(outtweets)
+
+    pass
+
+
+if __name__ == '__main__':
+    #pass in the username of the account you want to download
+    counter = 0
+    with open("users.txt", "r") as f:
+        for line in f:
+            get_all_tweets(line.rstrip())
+            counter = counter + 1
+            if (counter % 5) == 0:
+                sleep(15*60)
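
Usage notes: the script reads keys.txt (consumer key, consumer secret, access key,
and access secret, one per line, in that order, matching the readline() calls) and
users.txt (one screen name per line), writes one <screen_name>_tweets.csv per user,
and sleeps 15 minutes after every 5 users, presumably to stay under the API rate
limit. Below is a minimal sketch for loading one of the generated CSV files back
into Python; the handle "someuser" is only a hypothetical example standing in for
any name listed in users.txt, and the column names come from the header row the
script writes:

    import csv

    # Read the CSV written by get_all_tweets("someuser").
    # "someuser" is a placeholder; substitute any handle from users.txt.
    with open('someuser_tweets.csv', 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Columns: id, created_at, retweet, text, hashtags, user_mentions
            print(row['id'], row['created_at'], row['retweet'])

If the downloader runs under Python 3, the text column will contain the repr of
UTF-8 bytes (b'...') because of the .encode("utf-8") calls above, so code reading
the CSV may want to strip or decode that.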