package scraper

import (
    "fmt"
    "strings"
    "time"

    "offline_twitter/terminal_utils"
)

const DEFAULT_MAX_REPLIES_EAGER_LOAD = 50

type TweetID int64

type Tweet struct {
    ID             TweetID
    UserID         UserID
    UserHandle     UserHandle // For processing tombstones
    User           *User
    Text           string
    PostedAt       Timestamp
    NumLikes       int
    NumRetweets    int
    NumReplies     int
    NumQuoteTweets int
    InReplyToID    TweetID
    QuotedTweetID  TweetID

    Images        []Image
    Videos        []Video
    Mentions      []UserHandle
    ReplyMentions []UserHandle
    Hashtags      []string
    Urls          []Url
    Polls         []Poll
    Spaces        []Space

    TombstoneType string
    IsStub        bool

    IsContentDownloaded   bool
    IsConversationScraped bool
    LastScrapedAt         Timestamp
}

func (t Tweet) String() string {
    var author string
    if t.User != nil {
        author = fmt.Sprintf("%s\n@%s", t.User.DisplayName, t.User.Handle)
    } else {
        author = "@???"
    }

    ret := fmt.Sprintf(
        `%s
%s
%s
Replies: %d RT: %d QT: %d Likes: %d
`,
        author,
        terminal_utils.FormatDate(t.PostedAt.Time),
        terminal_utils.WrapText(t.Text, 60),
        t.NumReplies,
        t.NumRetweets,
        t.NumQuoteTweets,
        t.NumLikes,
    )

    if len(t.Images) > 0 {
        ret += fmt.Sprintf(terminal_utils.COLOR_GREEN+"images: %d\n"+terminal_utils.COLOR_RESET, len(t.Images))
    }
    if len(t.Urls) > 0 {
        ret += "urls: [\n"
        for _, url := range t.Urls {
            ret += " " + url.Text + "\n"
        }
        ret += "]"
    }

    return ret
}

// Turn an APITweet, as returned from the scraper, into a properly structured Tweet object
func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) {
    apiTweet.NormalizeContent()

    ret.ID = TweetID(apiTweet.ID)
    ret.UserID = UserID(apiTweet.UserID)
    ret.UserHandle = UserHandle(apiTweet.UserHandle)
    ret.Text = apiTweet.FullText

    // Process "posted-at" date and time
    if apiTweet.TombstoneText == "" { // Skip time parsing for tombstones
        ret.PostedAt, err = TimestampFromString(apiTweet.CreatedAt)
        if err != nil {
            return Tweet{}, fmt.Errorf("Error parsing time on tweet ID %d:\n %w", ret.ID, err)
        }
    }

    ret.NumLikes = apiTweet.FavoriteCount
    ret.NumRetweets = apiTweet.RetweetCount
    ret.NumReplies = apiTweet.ReplyCount
    ret.NumQuoteTweets = apiTweet.QuoteCount
    ret.InReplyToID = TweetID(apiTweet.InReplyToStatusID)
    ret.QuotedTweetID = TweetID(apiTweet.QuotedStatusID)

    // Process URLs and link previews
    for _, url := range apiTweet.Entities.URLs {
        var url_object Url
        if apiTweet.Card.ShortenedUrl == url.ShortenedUrl {
            if apiTweet.Card.Name == "3691233323:audiospace" {
                // This "url" is just a link to a Space.  Don't process it as a Url
                continue
            }
            url_object = ParseAPIUrlCard(apiTweet.Card)
        }
        url_object.Text = url.ExpandedURL
        url_object.ShortText = url.ShortenedUrl
        url_object.TweetID = ret.ID

        // Skip it if it's just the quoted tweet
        _, id, is_ok := TryParseTweetUrl(url.ExpandedURL)
        if is_ok && id == ret.QuotedTweetID {
            continue
        }

        ret.Urls = append(ret.Urls, url_object)
    }

    // Process images
    for _, media := range apiTweet.Entities.Media {
        if media.Type != "photo" { // TODO: remove this eventually
            panic(fmt.Errorf("Unknown media type %q:\n %w", media.Type, EXTERNAL_API_ERROR))
        }
        new_image := ParseAPIMedia(media)
        new_image.TweetID = ret.ID
        ret.Images = append(ret.Images, new_image)
    }

    // Process hashtags
    for _, hashtag := range apiTweet.Entities.Hashtags {
        ret.Hashtags = append(ret.Hashtags, hashtag.Text)
    }

    // Process `@` mentions and reply-mentions
    for _, mention := range apiTweet.Entities.Mentions {
        ret.Mentions = append(ret.Mentions, UserHandle(mention.UserName))
    }
    for _, mention := range strings.Split(apiTweet.Entities.ReplyMentions, " ") {
        if mention != "" {
            if mention[0] != '@' {
                panic(fmt.Errorf("Unknown ReplyMention value %q:\n %w", apiTweet.Entities.ReplyMentions, EXTERNAL_API_ERROR))
            }
            ret.ReplyMentions = append(ret.ReplyMentions, UserHandle(mention[1:]))
        }
    }

    // Process videos
    for _, entity := range apiTweet.ExtendedEntities.Media {
        if entity.Type != "video" && entity.Type != "animated_gif" {
            continue
        }
        new_video := ParseAPIVideo(entity, ret.ID) // This assigns TweetID
        ret.Videos = append(ret.Videos, new_video)

        // Remove the thumbnail from the Images list
        updated_imgs := []Image{}
        for _, img := range ret.Images {
            if VideoID(img.ID) != new_video.ID {
                updated_imgs = append(updated_imgs, img)
            }
        }
        ret.Images = updated_imgs
    }

    // Process polls
    if strings.Index(apiTweet.Card.Name, "poll") == 0 {
        poll := ParseAPIPoll(apiTweet.Card)
        poll.TweetID = ret.ID
        ret.Polls = []Poll{poll}
    }

    // Process spaces
    if apiTweet.Card.Name == "3691233323:audiospace" {
        space := ParseAPISpace(apiTweet.Card)
        ret.Spaces = []Space{space}
    }

    // Process tombstones and other metadata
    ret.TombstoneType = apiTweet.TombstoneText
    ret.IsStub = !(ret.TombstoneType == "")
    ret.LastScrapedAt = TimestampFromUnix(0) // Caller will change this for the tweet that was actually scraped
    ret.IsConversationScraped = false        // Safe due to the "No Worsening" principle

    return
}

/**
 * Get a single tweet with no replies from the API.
 *
 * args:
 * - id: the ID of the tweet to get
 *
 * returns: the single Tweet
 */
func GetTweet(id TweetID) (Tweet, error) {
    api := API{}
    tweet_response, err := api.GetTweet(id, "")
    if err != nil {
        return Tweet{}, fmt.Errorf("Error in API call:\n %w", err)
    }

    single_tweet, ok := tweet_response.GlobalObjects.Tweets[fmt.Sprint(id)]
    if !ok {
        return Tweet{}, fmt.Errorf("Didn't get the tweet!")
    }

    return ParseSingleTweet(single_tweet)
}
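
// A minimal usage sketch, for illustration only: the numeric ID is hypothetical and the
// error handling is a placeholder.
//
//    tweet, err := GetTweet(TweetID(12345))
//    if err != nil {
//        panic(err)
//    }
//    fmt.Println(tweet) // pretty-printed via the String() method above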

/**
 * Return a list of tweets, including the original and the rest of its thread,
 * along with a list of associated users.
 *
 * Mark the main tweet as "is_conversation_scraped = true", and update its "last_scraped_at"
 * value.
 *
 * args:
 * - id: the ID of the tweet to get
 *
 * returns: the tweet, list of its replies and context, and users associated with those replies
 */
func GetTweetFull(id TweetID) (trove TweetTrove, err error) {
    api := API{}
    tweet_response, err := api.GetTweet(id, "")
    if err != nil {
        err = fmt.Errorf("Error getting tweet: %d\n %w", id, err)
        return
    }
    if len(tweet_response.GlobalObjects.Tweets) < DEFAULT_MAX_REPLIES_EAGER_LOAD &&
        tweet_response.GetCursor() != "" {
        err = api.GetMoreReplies(id, &tweet_response, DEFAULT_MAX_REPLIES_EAGER_LOAD)
        if err != nil {
            err = fmt.Errorf("Error getting more tweet replies: %d\n %w", id, err)
            return
        }
    }

    // This has to be called BEFORE ParseTweetResponse, because it modifies the TweetResponse
    // (adds tombstone tweets to its tweets list)
    tombstoned_users := tweet_response.HandleTombstones()

    trove, err = ParseTweetResponse(tweet_response)
    if err != nil {
        panic(err)
    }
    trove.TombstoneUsers = tombstoned_users
    trove.FetchTombstoneUsers()

    // Quoted tombstones need their user_id filled out from the tombstoned_users list
    trove.FillMissingUserIDs()

    // Find the main tweet and update its "is_conversation_scraped" and "last_scraped_at"
    tweet, ok := trove.Tweets[id]
    if !ok {
        panic("Trove didn't contain its own tweet!")
    }
    tweet.LastScrapedAt = Timestamp{time.Now()}
    tweet.IsConversationScraped = true
    trove.Tweets[id] = tweet

    // tweets, retweets, users = trove.Transform()
    return
}
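
// A minimal usage sketch, for illustration only: the numeric ID is hypothetical; the field
// names come from the TweetTrove maps used above.
//
//    trove, err := GetTweetFull(TweetID(12345))
//    if err != nil {
//        panic(err)
//    }
//    for id, tweet := range trove.Tweets {
//        fmt.Println(id, tweet.Text)
//    }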

/**
 * Parse an API response object into a list of tweets, retweets and users
 *
 * args:
 * - resp: the response from the API
 *
 * returns: a list of tweets, retweets and users in that response object
 */
func ParseTweetResponse(resp TweetResponse) (TweetTrove, error) {
    trove := NewTweetTrove()

    for _, single_tweet := range resp.GlobalObjects.Tweets {
        if single_tweet.RetweetedStatusIDStr == "" {
            new_tweet, err := ParseSingleTweet(single_tweet)
            if err != nil {
                return trove, err
            }
            trove.Tweets[new_tweet.ID] = new_tweet
        } else {
            new_retweet, err := ParseSingleRetweet(single_tweet)
            if err != nil {
                return trove, err
            }
            trove.Retweets[new_retweet.RetweetID] = new_retweet
        }
    }
    for _, user := range resp.GlobalObjects.Users {
        new_user, err := ParseSingleUser(user)
        if err != nil {
            return trove, err
        }
        trove.Users[new_user.ID] = new_user
    }
    return trove, nil
}
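
// A minimal sketch of consuming a parsed trove directly, for illustration only: it assumes a
// TweetResponse value `resp` obtained elsewhere (e.g. from api.GetTweet, as in GetTweet above).
//
//    trove, err := ParseTweetResponse(resp)
//    if err != nil {
//        panic(err)
//    }
//    fmt.Println(len(trove.Tweets), len(trove.Retweets), len(trove.Users))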