From 8e13e30ac53f346de231f3cdd8cc969669309d59 Mon Sep 17 00:00:00 2001 From: Alessio Date: Mon, 20 Dec 2021 15:27:59 -0500 Subject: [PATCH] Scraping now updates last_scraped_at and sets is_conversation_scraped --- cmd/tests.sh | 2 ++ scraper/tweet.go | 19 ++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/cmd/tests.sh b/cmd/tests.sh index 98bbf0e..33c69ba 100755 --- a/cmd/tests.sh +++ b/cmd/tests.sh @@ -94,7 +94,9 @@ test $(find videos | wc -l) = "$((initial_videos_count + 1))" # Download a full thread tw fetch_tweet https://twitter.com/RememberAfghan1/status/1429585423702052867 test $(sqlite3 twitter.db "select handle from tweets join users on tweets.user_id = users.id where tweets.id=1429585423702052867") = "RememberAfghan1" +test $(sqlite3 twitter.db "select is_conversation_scraped, abs(last_scraped_at - strftime('%s','now')) < 30 from tweets where id = 1429585423702052867") = "1|1" test $(sqlite3 twitter.db "select handle from tweets join users on tweets.user_id = users.id where tweets.id=1429584239570391042") = "michaelmalice" +test $(sqlite3 twitter.db "select is_conversation_scraped from tweets where id = 1429584239570391042") = "0" # test $(sqlite3 twitter.db "select handle from tweets join users on tweets.user_id = users.id where tweets.id=1429583672827465730") = "kanesays23" TODO: this guy got banned test $(sqlite3 twitter.db "select handle from tweets join users on tweets.user_id = users.id where tweets.id=1429616911315345414") = "NovaValentis" test $(sqlite3 twitter.db "select reply_mentions from tweets where id = 1429585423702052867") = "michaelmalice" diff --git a/scraper/tweet.go b/scraper/tweet.go index 74f21f2..d25733a 100644 --- a/scraper/tweet.go +++ b/scraper/tweet.go @@ -163,9 +163,11 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { ret.Polls = []Poll{poll} } - // Process tombstones + // Process tombstones and other metadata ret.TombstoneType = apiTweet.TombstoneText ret.IsStub = 
!(ret.TombstoneType == "") + ret.LastScrapedAt = time.Unix(0, 0) // Caller will change this for the tweet that was actually scraped + ret.IsConversationScraped = false // Safe due to the "No Worsening" principle return } @@ -200,6 +202,9 @@ func GetTweet(id TweetID) (Tweet, error) { * Return a list of tweets, including the original and the rest of its thread, * along with a list of associated users. * + * Mark the main tweet as "is_conversation_scraped = true", and update its "last_scraped_at" + * value. + * * args: * - id: the ID of the tweet to get * @@ -230,6 +235,18 @@ func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User, users = append(users, fetched_user) } tweets, retweets, _users, err := ParseTweetResponse(tweet_response) + + // Find the main tweet and update its "is_conversation_scraped" and "last_scraped_at" + scrape_time := time.Now() + for i, t := range(tweets) { + fmt.Printf("Checking tweet %d (%v)\n", t.ID, t.LastScrapedAt) + if t.ID == id { + // Index the slice because `tweets[i]` is a reference, whereas `t` is a copy + tweets[i].LastScrapedAt = scrape_time + tweets[i].IsConversationScraped = true + fmt.Printf("Updating tweet %d: %v\n", tweets[i].ID, tweets[i].LastScrapedAt.Unix()) + } + } users = append(users, _users...) return }