Scraping now updates last_scraped_at and sets is_conversation_scraped

This commit is contained in:
Alessio 2021-12-20 15:27:59 -05:00
parent c1bcd54a11
commit 8e13e30ac5
2 changed files with 20 additions and 1 deletions

View File

@ -94,7 +94,9 @@ test $(find videos | wc -l) = "$((initial_videos_count + 1))"
# Download a full thread
tw fetch_tweet https://twitter.com/RememberAfghan1/status/1429585423702052867
test $(sqlite3 twitter.db "select handle from tweets join users on tweets.user_id = users.id where tweets.id=1429585423702052867") = "RememberAfghan1"
test $(sqlite3 twitter.db "select is_conversation_scraped, abs(last_scraped_at - strftime('%s','now')) < 30 from tweets where id = 1429585423702052867") = "1|1"
test $(sqlite3 twitter.db "select handle from tweets join users on tweets.user_id = users.id where tweets.id=1429584239570391042") = "michaelmalice"
test $(sqlite3 twitter.db "select is_conversation_scraped from tweets where id = 1429584239570391042") = "0"
# test $(sqlite3 twitter.db "select handle from tweets join users on tweets.user_id = users.id where tweets.id=1429583672827465730") = "kanesays23" TODO: this guy got banned
test $(sqlite3 twitter.db "select handle from tweets join users on tweets.user_id = users.id where tweets.id=1429616911315345414") = "NovaValentis"
test $(sqlite3 twitter.db "select reply_mentions from tweets where id = 1429585423702052867") = "michaelmalice"

View File

@ -163,9 +163,11 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
ret.Polls = []Poll{poll}
}
// Process tombstones
// Process tombstones and other metadata
ret.TombstoneType = apiTweet.TombstoneText
ret.IsStub = !(ret.TombstoneType == "")
ret.LastScrapedAt = time.Unix(0, 0) // Caller will change this for the tweet that was actually scraped
ret.IsConversationScraped = false // Safe due to the "No Worsening" principle
return
}
@ -200,6 +202,9 @@ func GetTweet(id TweetID) (Tweet, error) {
* Return a list of tweets, including the original and the rest of its thread,
* along with a list of associated users.
*
* Mark the main tweet as "is_conversation_scraped = true", and update its "last_scraped_at"
* value to the time of this scrape.
*
* args:
* - id: the ID of the tweet to get
*
@ -230,6 +235,18 @@ func GetTweetFull(id TweetID) (tweets []Tweet, retweets []Retweet, users []User,
users = append(users, fetched_user)
}
tweets, retweets, _users, err := ParseTweetResponse(tweet_response)
// Find the main tweet and update its "is_conversation_scraped" and "last_scraped_at"
scrape_time := time.Now()
for i, t := range(tweets) {
fmt.Printf("Checking tweet %d (%v)\n", t.ID, t.LastScrapedAt)
if t.ID == id {
// Index the slice because `tweets[i]` is a reference, whereas `t` is a copy
tweets[i].LastScrapedAt = scrape_time
tweets[i].IsConversationScraped = true
fmt.Printf("Updating tweet %d: %v\n", tweets[i].ID, tweets[i].LastScrapedAt.Unix())
}
}
users = append(users, _users...)
return
}