Add downloading for media in tweets

This commit is contained in:
Alessio 2021-08-07 16:51:38 -07:00
parent d1ba8b48f3
commit 9734c09426
6 changed files with 189 additions and 7 deletions

View File

@ -0,0 +1,104 @@
package persistence
import (
"fmt"
"os"
"path"
"net/http"
"io/ioutil"
"offline_twitter/scraper"
)
type MediaDownloader interface {
Curl(url string, outpath string) error
}
type DefaultDownloader struct {}
/**
* Download a file over HTTP and save it.
*
* args:
* - url: the remote file to download
* - outpath: the path on disk to save it to
*/
func (d DefaultDownloader) Curl(url string, outpath string) error {
println(url)
resp, err := http.Get(url)
if err != nil {
return err
}
if resp.StatusCode != 200 {
return fmt.Errorf("Error %s: %s", url, resp.Status)
}
data, err := ioutil.ReadAll(resp.Body)
if err != nil {
return fmt.Errorf("Error downloading image %s: %s", url, err.Error())
}
err = os.WriteFile(outpath, data, 0644)
if err != nil {
return fmt.Errorf("Error writing to path: %s, url: %s: %s", outpath, url, err.Error())
}
return nil
}
/**
* Downloads an Image, and if successful, marks it as downloaded in the DB
*/
func (p Profile) download_tweet_image(img *scraper.Image, downloader MediaDownloader) error {
outfile := path.Join(p.ProfileDir, "images", img.LocalFilename)
err := downloader.Curl(img.RemoteURL, outfile)
if err != nil {
return err
}
img.IsDownloaded = true
return p.SaveImage(*img)
}
/**
* Downloads an Video, and if successful, marks it as downloaded in the DB
*/
func (p Profile) download_tweet_video(v *scraper.Video, downloader MediaDownloader) error {
outfile := path.Join(p.ProfileDir, "videos", v.LocalFilename)
err := downloader.Curl(v.RemoteURL, outfile)
if err != nil {
return err
}
v.IsDownloaded = true
return p.SaveVideo(*v)
}
/**
* Download a tweet's video and picture content.
*
* Wraps the `DownloadTweetContentWithInjector` method with the default (i.e., real) downloader.
*/
func (p Profile) DownloadTweetContentFor(t *scraper.Tweet) error {
return p.DownloadTweetContentWithInjector(t, DefaultDownloader{})
}
/**
* Enable injecting a custom MediaDownloader (i.e., for testing)
*/
func (p Profile) DownloadTweetContentWithInjector(t *scraper.Tweet, downloader MediaDownloader) error {
for i := range t.Images {
err := p.download_tweet_image(&t.Images[i], downloader)
if err != nil {
return err
}
}
for i := range t.Videos {
err := p.download_tweet_video(&t.Videos[i], downloader)
if err != nil {
return err
}
}
t.IsContentDownloaded = true
return p.SaveTweet(*t)
}

View File

@ -0,0 +1,72 @@
package persistence_test
import (
"testing"
"offline_twitter/scraper"
)
type FakeDownloader struct {}
func (d FakeDownloader) Curl(url string, outpath string) error { return nil }
func test_all_downloaded(tweet scraper.Tweet, yes_or_no bool, t *testing.T) {
error_msg := map[bool]string{
true: "Expected to be downloaded, but it wasn't",
false: "Expected not to be downloaded, but it was",
}[yes_or_no]
if len(tweet.Images) != 2 {
t.Errorf("Expected %d images, got %d", 2, len(tweet.Images))
}
if len(tweet.Videos) != 1 {
t.Errorf("Expected %d videos, got %d", 1, len(tweet.Videos))
}
for _, img := range tweet.Images {
if img.IsDownloaded != yes_or_no {
t.Errorf("%s: ImageID %d", error_msg, img.ID)
}
}
for _, vid := range tweet.Videos {
if vid.IsDownloaded != yes_or_no {
t.Errorf("Expected not to be downloaded, but it was: VideoID %d", vid.ID)
}
}
if tweet.IsContentDownloaded != yes_or_no {
t.Errorf("%s: the tweet", error_msg)
}
}
/**
* Create an Image, save it, reload it, and make sure it comes back the same
*/
func TestDownloadTweetContent(t *testing.T) {
profile_path := "test_profiles/TestMediaQueries"
profile := create_or_load_profile(profile_path)
tweet := create_dummy_tweet()
// Persist the tweet
err := profile.SaveTweet(tweet)
if err != nil {
t.Fatalf("Failed to save the tweet: %s", err.Error())
}
// Make sure everything is marked "not downloaded"
test_all_downloaded(tweet, false, t)
// Do the (fake) downloading
err = profile.DownloadTweetContentWithInjector(&tweet, FakeDownloader{})
if err != nil {
t.Fatalf("Error running fake download: %s", err.Error())
}
// It should all be marked "yes downloaded" now
test_all_downloaded(tweet, true, t)
// Reload the tweet (check db); should also be "yes downloaded"
new_tweet, err := profile.GetTweetById(tweet.ID)
if err != nil {
t.Fatalf("Couldn't reload the tweet: %s", err.Error())
}
test_all_downloaded(new_tweet, true, t)
}

View File

@ -17,16 +17,17 @@ func (p Profile) SaveTweet(t scraper.Tweet) error {
return err return err
} }
_, err = db.Exec(` _, err = db.Exec(`
insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, hashtags) insert into tweets (id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, hashtags, is_content_downloaded)
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
on conflict do update on conflict do update
set num_likes=?, set num_likes=?,
num_retweets=?, num_retweets=?,
num_replies=?, num_replies=?,
num_quote_tweets=? num_quote_tweets=?,
is_content_downloaded=?
`, `,
t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), strings.Join(t.Hashtags, ","), t.ID, t.UserID, t.Text, t.PostedAt.Unix(), t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.InReplyTo, t.QuotedTweet, scraper.JoinArrayOfHandles(t.Mentions), strings.Join(t.Hashtags, ","), t.IsContentDownloaded,
t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.NumLikes, t.NumRetweets, t.NumReplies, t.NumQuoteTweets, t.IsContentDownloaded,
) )
if err != nil { if err != nil {
@ -106,7 +107,7 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
db := p.DB db := p.DB
stmt, err := db.Prepare(` stmt, err := db.Prepare(`
select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, hashtags select id, user_id, text, posted_at, num_likes, num_retweets, num_replies, num_quote_tweets, in_reply_to, quoted_tweet, mentions, hashtags, is_content_downloaded
from tweets from tweets
where id = ? where id = ?
`) `)
@ -122,7 +123,7 @@ func (p Profile) GetTweetById(id scraper.TweetID) (scraper.Tweet, error) {
var hashtags string var hashtags string
row := stmt.QueryRow(id) row := stmt.QueryRow(id)
err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &hashtags) err = row.Scan(&t.ID, &t.UserID, &t.Text, &postedAt, &t.NumLikes, &t.NumRetweets, &t.NumReplies, &t.NumQuoteTweets, &t.InReplyTo, &t.QuotedTweet, &mentions, &hashtags, &t.IsContentDownloaded)
if err != nil { if err != nil {
return t, err return t, err
} }

View File

@ -15,6 +15,7 @@ func TestSaveAndLoadTweet(t *testing.T) {
profile := create_or_load_profile(profile_path) profile := create_or_load_profile(profile_path)
tweet := create_dummy_tweet() tweet := create_dummy_tweet()
tweet.IsContentDownloaded = true
// Save the tweet // Save the tweet
err := profile.SaveTweet(tweet) err := profile.SaveTweet(tweet)

View File

@ -29,6 +29,8 @@ type Tweet struct {
Mentions []UserHandle Mentions []UserHandle
Hashtags []string Hashtags []string
QuotedTweet TweetID QuotedTweet TweetID
IsContentDownloaded bool
} }

View File

@ -35,6 +35,8 @@ type User struct {
BannerImageUrl string BannerImageUrl string
PinnedTweetID TweetID PinnedTweetID TweetID
PinnedTweet *Tweet PinnedTweet *Tweet
IsContentDownloaded bool
} }
func (u User) String() string { func (u User) String() string {