2021-05-22 18:20:18 -04:00
package scraper
2021-05-23 20:58:31 -04:00
import (
2021-11-01 14:06:15 -07:00
"fmt"
2022-01-05 21:04:40 -05:00
"html"
2021-05-23 20:58:31 -04:00
"time"
"strings"
"encoding/json"
2021-08-04 14:49:20 -07:00
"strconv"
2021-11-01 14:06:15 -07:00
"sort"
2021-05-23 20:58:31 -04:00
)
2021-05-22 18:20:18 -04:00
2021-08-04 23:41:58 -07:00
// APIMedia mirrors one media object of the JSON API response. It is the element type of
// APITweet.Entities.Media.
type APIMedia struct {
	ID            int64  `json:"id_str,string"` // Sent as a quoted decimal string; decoded into an int64
	MediaURLHttps string `json:"media_url_https"`
	Type          string `json:"type"`
	URL           string `json:"url"`
	// OriginalInfo carries the width and height reported under "original_info".
	OriginalInfo struct {
		Width  int `json:"width"`
		Height int `json:"height"`
	} `json:"original_info"`
}
2021-07-25 14:51:17 -07:00
// SortableVariants is the list of variants found under a media item's "video_info". It provides
// Len, Swap and Less so it can be handed to sort.Sort, which then puts the highest bitrate first.
type SortableVariants []struct {
	Bitrate int    `json:"bitrate,omitempty"`
	URL     string `json:"url"`
}

// Len reports how many variants the list holds.
func (variants SortableVariants) Len() int {
	return len(variants)
}

// Swap exchanges the variants at positions i and j.
func (variants SortableVariants) Swap(i, j int) {
	variants[j], variants[i] = variants[i], variants[j]
}

// Less reports whether variant i sorts before variant j, i.e. whether it has the strictly
// higher bitrate (descending order).
func (variants SortableVariants) Less(i, j int) bool {
	return variants[j].Bitrate < variants[i].Bitrate
}
2021-08-04 23:41:58 -07:00
type APIExtendedMedia struct {
2021-08-04 01:23:55 -07:00
ID int64 ` json:"id_str,string" `
2021-08-03 17:34:44 -07:00
MediaURLHttps string ` json:"media_url_https" `
Type string ` json:"type" `
2021-08-04 23:41:58 -07:00
VideoInfo struct {
Variants SortableVariants ` json:"variants" `
2021-12-24 16:26:34 -05:00
Duration int ` json:"duration_millis" `
2021-08-04 23:41:58 -07:00
} ` json:"video_info" `
2021-10-10 16:06:47 -07:00
OriginalInfo struct {
Width int ` json:"width" `
Height int ` json:"height" `
} ` json:"original_info" `
2021-12-24 16:26:34 -05:00
Ext struct {
MediaStats struct {
R interface { } ` json:"r" `
} ` json:"mediaStats" `
} ` json:"ext" `
2021-08-03 17:34:44 -07:00
}
2021-09-17 13:41:43 -07:00
// APICard mirrors the "card" object attached to a tweet (see APITweet.Card). Its binding values
// cover link-preview metadata (domain, title, description, thumbnail), a player image, the
// creator / site user IDs, and poll data.
type APICard struct {
	Name          string `json:"name"`
	ShortenedUrl  string `json:"url"`
	BindingValues struct {
		Domain struct {
			Value string `json:"string_value"`
		} `json:"domain"`
		Creator struct {
			UserValue struct {
				Value int64 `json:"id_str,string"`
			} `json:"user_value"`
		} `json:"creator"`
		Site struct {
			UserValue struct {
				Value int64 `json:"id_str,string"`
			} `json:"user_value"`
		} `json:"site"`
		Title struct {
			Value string `json:"string_value"`
		} `json:"title"`
		Description struct {
			Value string `json:"string_value"`
		} `json:"description"`
		Thumbnail struct {
			ImageValue struct {
				Url    string `json:"url"`
				Width  int    `json:"width"`
				Height int    `json:"height"`
			} `json:"image_value"`
		} `json:"thumbnail_image_large"`
		PlayerImage struct {
			ImageValue struct {
				Url string `json:"url"`
			} `json:"image_value"`
		} `json:"player_image_large"`

		// For polls. Note that counts, the end time and the duration all arrive as strings.
		Choice1 struct {
			StringValue string `json:"string_value"`
		} `json:"choice1_label"`
		Choice2 struct {
			StringValue string `json:"string_value"`
		} `json:"choice2_label"`
		Choice3 struct {
			StringValue string `json:"string_value"`
		} `json:"choice3_label"`
		Choice4 struct {
			StringValue string `json:"string_value"`
		} `json:"choice4_label"`
		Choice1_Count struct {
			StringValue string `json:"string_value"`
		} `json:"choice1_count"`
		Choice2_Count struct {
			StringValue string `json:"string_value"`
		} `json:"choice2_count"`
		Choice3_Count struct {
			StringValue string `json:"string_value"`
		} `json:"choice3_count"`
		Choice4_Count struct {
			StringValue string `json:"string_value"`
		} `json:"choice4_count"`
		EndDatetimeUTC struct {
			StringValue string `json:"string_value"`
		} `json:"end_datetime_utc"`
		CountsAreFinal struct {
			BooleanValue bool `json:"boolean_value"`
		} `json:"counts_are_final"`
		DurationMinutes struct {
			StringValue string `json:"string_value"`
		} `json:"duration_minutes"`
		LastUpdatedAt struct {
			StringValue string `json:"string_value"`
		} `json:"last_updated_datetime_utc"`
	} `json:"binding_values"`
}
2021-05-22 18:20:18 -04:00
type APITweet struct {
2021-08-04 14:49:20 -07:00
ID int64 ` json:"id_str,string" `
ConversationID int64 ` json:"conversation_id_str,string" `
2021-05-22 18:20:18 -04:00
CreatedAt string ` json:"created_at" `
FavoriteCount int ` json:"favorite_count" `
FullText string ` json:"full_text" `
2021-09-27 18:12:28 -07:00
DisplayTextRange [ ] int ` json:"display_text_range" `
2021-05-22 18:20:18 -04:00
Entities struct {
Hashtags [ ] struct {
Text string ` json:"text" `
} ` json:"hashtags" `
2021-08-03 17:34:44 -07:00
Media [ ] APIMedia ` json:"media" `
2021-05-22 18:20:18 -04:00
URLs [ ] struct {
2021-09-17 19:45:31 -07:00
ExpandedURL string ` json:"expanded_url" `
ShortenedUrl string ` json:"url" `
2021-05-22 18:20:18 -04:00
} ` json:"urls" `
Mentions [ ] struct {
UserName string ` json:"screen_name" `
2021-08-04 02:00:58 -07:00
UserID int64 ` json:"id_str,string" `
2021-05-23 20:58:31 -04:00
} ` json:"user_mentions" `
2021-09-27 18:12:28 -07:00
ReplyMentions string // The leading part of the text which is cut off by "DisplayTextRange"
2021-05-22 18:20:18 -04:00
} ` json:"entities" `
ExtendedEntities struct {
2021-08-04 23:41:58 -07:00
Media [ ] APIExtendedMedia ` json:"media" `
2021-05-22 18:20:18 -04:00
} ` json:"extended_entities" `
2021-09-27 18:12:28 -07:00
InReplyToStatusID int64 ` json:"in_reply_to_status_id_str,string" `
2021-11-01 14:06:15 -07:00
InReplyToUserID int64 ` json:"in_reply_to_user_id_str,string" `
2021-09-27 18:12:28 -07:00
InReplyToScreenName string ` json:"in_reply_to_screen_name" `
ReplyCount int ` json:"reply_count" `
RetweetCount int ` json:"retweet_count" `
QuoteCount int ` json:"quote_count" `
RetweetedStatusIDStr string ` json:"retweeted_status_id_str" ` // Can be empty string
RetweetedStatusID int64
QuotedStatusIDStr string ` json:"quoted_status_id_str" ` // Can be empty string
QuotedStatusID int64
QuotedStatusPermalink struct {
2022-02-01 16:56:37 -08:00
ShortURL string ` json:"url" `
2021-09-27 18:12:28 -07:00
ExpandedURL string ` json:"expanded" `
} ` json:"quoted_status_permalink" `
Time time . Time ` json:"time" `
UserID int64 ` json:"user_id_str,string" `
2022-01-07 13:42:00 -05:00
UserHandle string
2021-09-27 18:12:28 -07:00
Card APICard ` json:"card" `
2021-11-01 14:06:15 -07:00
TombstoneText string
2021-05-22 18:20:18 -04:00
}
2021-05-23 20:58:31 -04:00
func ( t * APITweet ) NormalizeContent ( ) {
2021-08-04 14:49:20 -07:00
id , err := strconv . Atoi ( t . QuotedStatusIDStr )
if err == nil {
t . QuotedStatusID = int64 ( id )
}
id , err = strconv . Atoi ( t . RetweetedStatusIDStr )
if err == nil {
t . RetweetedStatusID = int64 ( id )
}
2021-09-27 18:12:28 -07:00
if ( len ( t . DisplayTextRange ) == 2 ) {
t . Entities . ReplyMentions = strings . TrimSpace ( string ( [ ] rune ( t . FullText ) [ 0 : t . DisplayTextRange [ 0 ] ] ) )
t . FullText = string ( [ ] rune ( t . FullText ) [ t . DisplayTextRange [ 0 ] : t . DisplayTextRange [ 1 ] ] )
}
2022-02-01 16:56:37 -08:00
// Handle short links showing up at ends of tweets
for _ , url := range t . Entities . URLs {
index := strings . Index ( t . FullText , url . ShortenedUrl )
if index == ( len ( t . FullText ) - len ( url . ShortenedUrl ) ) {
t . FullText = strings . TrimSpace ( t . FullText [ 0 : index ] )
}
}
2021-09-27 18:12:28 -07:00
// Handle pasted tweet links that turn into quote tweets but still have a link in them
2022-02-01 16:56:37 -08:00
// This is a separate case from above because we want it gone even if it's in the middle of the tweet
2021-09-27 18:12:28 -07:00
if t . QuotedStatusID != 0 {
for _ , url := range t . Entities . URLs {
2022-02-01 16:56:37 -08:00
if url . ShortenedUrl == t . QuotedStatusPermalink . ShortURL {
2021-09-27 18:12:28 -07:00
t . FullText = strings . ReplaceAll ( t . FullText , url . ShortenedUrl , "" )
}
}
}
2022-01-05 21:04:40 -05:00
t . FullText = html . UnescapeString ( t . FullText )
2021-09-27 18:12:28 -07:00
t . FullText = strings . TrimSpace ( t . FullText )
2021-05-23 20:58:31 -04:00
}
// String renders the tweet as its JSON encoding. It panics if the tweet cannot be marshalled.
func (t APITweet) String() string {
	encoded, marshal_err := json.Marshal(t)
	if marshal_err == nil {
		return string(encoded)
	}
	panic(marshal_err)
}
2021-06-13 12:43:34 -07:00
// APIUser mirrors one user object of the JSON API response (the values of
// TweetResponse.GlobalObjects.Users and UserResponse's "legacy" object).
type APIUser struct {
	CreatedAt   string `json:"created_at"`
	Description string `json:"description"`
	Entities    struct {
		URL struct {
			Urls []struct {
				ExpandedURL string `json:"expanded_url"`
			} `json:"urls"`
		} `json:"url"`
	} `json:"entities"`
	FavouritesCount      int      `json:"favourites_count"`
	FollowersCount       int      `json:"followers_count"`
	FriendsCount         int      `json:"friends_count"`
	ID                   int64    `json:"id_str,string"`
	ListedCount          int      `json:"listed_count"`
	Name                 string   `json:"name"`
	Location             string   `json:"location"`
	PinnedTweetIdsStr    []string `json:"pinned_tweet_ids_str"` // Dunno how to type-convert an array
	ProfileBannerURL     string   `json:"profile_banner_url"`
	ProfileImageURLHTTPS string   `json:"profile_image_url_https"`
	Protected            bool     `json:"protected"`
	ScreenName           string   `json:"screen_name"`
	StatusesCount        int      `json:"statuses_count"`
	Verified             bool     `json:"verified"`
	IsBanned             bool     // No JSON tag; set by UserResponse.ConvertToAPIUser for suspended users
}
type UserResponse struct {
Data struct {
User struct {
2021-08-04 02:00:58 -07:00
ID int64 ` json:"rest_id,string" `
2021-06-13 12:43:34 -07:00
Legacy APIUser ` json:"legacy" `
} ` json:"user" `
} ` json:"data" `
2021-08-22 16:41:59 -07:00
Errors [ ] struct {
Message string ` json:"message" `
2021-08-22 17:55:21 -07:00
Code int ` json:"code" `
2021-08-22 16:41:59 -07:00
} ` json:"errors" `
2021-06-13 12:43:34 -07:00
}
func ( u UserResponse ) ConvertToAPIUser ( ) APIUser {
ret := u . Data . User . Legacy
2021-08-04 02:00:58 -07:00
ret . ID = u . Data . User . ID
2022-01-06 14:39:31 -05:00
// Banned users
for _ , api_error := range u . Errors {
if api_error . Message == "Authorization: User has been suspended. (63)" {
ret . IsBanned = true
} else {
panic ( fmt . Sprintf ( "Unknown api error: %q" , api_error . Message ) )
}
}
2021-06-13 12:43:34 -07:00
return ret
}
2021-11-01 14:06:15 -07:00
// Entry mirrors one timeline entry of the JSON API response. Depending on which parts are
// present it describes a tweet (Content.Item.Content.Tweet), a tombstone
// (Content.Item.Content.Tombstone) or a cursor (Content.Operation.Cursor).
type Entry struct {
	EntryID   string `json:"entryId"`
	SortIndex int64  `json:"sortIndex,string"` // Sent as a quoted decimal string
	Content   struct {
		Item struct {
			Content struct {
				Tombstone struct {
					TombstoneInfo struct {
						RichText struct {
							Text string `json:"text"`
						} `json:"richText"`
					} `json:"tombstoneInfo"`
				} `json:"tombstone"`
				Tweet struct {
					ID int64 `json:"id,string"`
				} `json:"tweet"`
			} `json:"content"`
		} `json:"item"`
		Operation struct {
			Cursor struct {
				Value string `json:"value"`
			} `json:"cursor"`
		} `json:"operation"`
	} `json:"content"`
}

// GetTombstoneText returns the entry's tombstone rich text, which is the empty string when the
// entry carries no tombstone text.
func (e Entry) GetTombstoneText() string {
	tombstone := e.Content.Item.Content.Tombstone
	return tombstone.TombstoneInfo.RichText.Text
}

// SortableEntries is a list of entries that provides Len, Swap and Less so it can be handed to
// sort.Sort, which then orders it by descending SortIndex.
type SortableEntries []Entry

// Len reports how many entries the list holds.
func (e SortableEntries) Len() int {
	return len(e)
}

// Swap exchanges the entries at positions i and j.
func (e SortableEntries) Swap(i, j int) {
	e[j], e[i] = e[i], e[j]
}

// Less reports whether entry i sorts before entry j, i.e. whether it has the strictly larger
// SortIndex.
func (e SortableEntries) Less(i, j int) bool {
	return e[j].SortIndex < e[i].SortIndex
}
2021-05-22 18:20:18 -04:00
type TweetResponse struct {
GlobalObjects struct {
Tweets map [ string ] APITweet ` json:"tweets" `
2021-06-13 12:43:34 -07:00
Users map [ string ] APIUser ` json:"users" `
2021-05-22 18:20:18 -04:00
} ` json:"globalObjects" `
2021-05-23 20:58:31 -04:00
Timeline struct {
Instructions [ ] struct {
AddEntries struct {
2021-11-01 14:10:20 -07:00
Entries SortableEntries ` json:"entries" `
2021-05-23 20:58:31 -04:00
} ` json:"addEntries" `
2021-11-01 14:10:20 -07:00
ReplaceEntry struct {
Entry Entry
} ` json:"replaceEntry" `
2021-05-23 20:58:31 -04:00
} ` json:"instructions" `
} ` json:"timeline" `
2021-05-22 18:20:18 -04:00
}
2021-11-01 14:10:20 -07:00
// tombstone_types maps the full tombstone text shown in place of an unavailable tweet to the
// short label stored in APITweet.TombstoneText. Keys must match the received text exactly
// (note the typographic apostrophe "’" in some of them).
var tombstone_types = map[string]string{
	"This Tweet was deleted by the Tweet author. Learn more": "deleted",
	"This Tweet is from a suspended account. Learn more":     "suspended",
	"You’re unable to view this Tweet because this account owner limits who can view their Tweets. Learn more": "hidden",
	"This Tweet is unavailable. Learn more":                           "unavailable",
	"This Tweet violated the Twitter Rules. Learn more":               "violated",
	"This Tweet is from an account that no longer exists. Learn more": "no longer exists",
	"Age-restricted adult content. This content might not be appropriate for people under 18 years old. To view this media, you’ll need to log in to Twitter. Learn more": "age-restricted",
}
/ * *
* Insert tweets into GlobalObjects for each tombstone . Returns a list of users that need to
* be fetched for tombstones .
* /
2022-01-07 13:42:00 -05:00
func ( t * TweetResponse ) HandleTombstones ( ) [ ] UserHandle {
ret := [ ] UserHandle { }
2021-11-01 14:10:20 -07:00
2022-01-07 13:42:00 -05:00
// Handle tombstones in quote-tweets
for _ , api_tweet := range t . GlobalObjects . Tweets {
// Ignore if tweet doesn't have a quoted tweet
if api_tweet . QuotedStatusIDStr == "" {
continue
}
// Ignore if quoted tweet is in the Global Objects (i.e., not a tombstone)
if _ , ok := t . GlobalObjects . Tweets [ api_tweet . QuotedStatusIDStr ] ; ok {
continue
}
user_handle , err := ParseHandleFromTweetUrl ( api_tweet . QuotedStatusPermalink . ExpandedURL )
if err != nil {
panic ( err )
}
var tombstoned_tweet APITweet
tombstoned_tweet . ID = int64 ( int_or_panic ( api_tweet . QuotedStatusIDStr ) )
tombstoned_tweet . UserHandle = string ( user_handle )
tombstoned_tweet . TombstoneText = "unavailable"
ret = append ( ret , user_handle )
fmt . Printf ( "Adding quoted tombstoned tweet: TweetID %d, handle %q\n" , tombstoned_tweet . ID , tombstoned_tweet . UserHandle )
t . GlobalObjects . Tweets [ api_tweet . QuotedStatusIDStr ] = tombstoned_tweet
}
// Handle tombstones in the conversation flow
2021-11-01 14:10:20 -07:00
entries := t . Timeline . Instructions [ 0 ] . AddEntries . Entries
sort . Sort ( entries )
for i , entry := range entries {
if entry . GetTombstoneText ( ) != "" {
// Try to reconstruct the tombstone tweet
var tombstoned_tweet APITweet
tombstoned_tweet . ID = int64 ( i ) // Set a default to prevent clobbering other tombstones
if i + 1 < len ( entries ) && entries [ i + 1 ] . Content . Item . Content . Tweet . ID != 0 {
next_tweet_id := entries [ i + 1 ] . Content . Item . Content . Tweet . ID
api_tweet , ok := t . GlobalObjects . Tweets [ fmt . Sprint ( next_tweet_id ) ]
if ! ok {
panic ( "Weird situation!" )
}
tombstoned_tweet . ID = api_tweet . InReplyToStatusID
tombstoned_tweet . UserID = api_tweet . InReplyToUserID
2022-01-07 13:42:00 -05:00
ret = append ( ret , UserHandle ( api_tweet . InReplyToScreenName ) )
2021-11-01 14:10:20 -07:00
}
if i - 1 >= 0 && entries [ i - 1 ] . Content . Item . Content . Tweet . ID != 0 {
prev_tweet_id := entries [ i - 1 ] . Content . Item . Content . Tweet . ID
_ , ok := t . GlobalObjects . Tweets [ fmt . Sprint ( prev_tweet_id ) ]
if ! ok {
panic ( "Weird situation 2!" )
}
tombstoned_tweet . InReplyToStatusID = prev_tweet_id
}
short_text , ok := tombstone_types [ entry . GetTombstoneText ( ) ]
if ! ok {
panic ( fmt . Sprintf ( "Unknown tombstone text: %s" , entry . GetTombstoneText ( ) ) )
}
tombstoned_tweet . TombstoneText = short_text
// Add the tombstoned tweet to GlobalObjects
t . GlobalObjects . Tweets [ fmt . Sprint ( tombstoned_tweet . ID ) ] = tombstoned_tweet
}
}
2022-01-07 13:42:00 -05:00
2021-11-01 14:10:20 -07:00
return ret
}
2021-05-23 20:58:31 -04:00
func ( t * TweetResponse ) GetCursor ( ) string {
entries := t . Timeline . Instructions [ 0 ] . AddEntries . Entries
2021-11-06 14:50:39 -07:00
if len ( entries ) > 0 {
last_entry := entries [ len ( entries ) - 1 ]
if strings . Contains ( last_entry . EntryID , "cursor" ) {
return last_entry . Content . Operation . Cursor . Value
}
}
// Next, try the other format ("replaceEntry")
instructions := t . Timeline . Instructions
last_replace_entry := instructions [ len ( instructions ) - 1 ] . ReplaceEntry . Entry
if strings . Contains ( last_replace_entry . EntryID , "cursor" ) {
return last_replace_entry . Content . Operation . Cursor . Value
2021-05-23 20:58:31 -04:00
}
return ""
}
2021-08-04 14:49:20 -07:00
2021-08-22 15:01:46 -07:00
/ * *
* Test for one case of end - of - feed . Cursor increments on each request for some reason , but
* there ' s no new content . This seems to happen when there ' s a pinned tweet .
*
* In this case , we look for an "entries" object that has only cursors in it , and no tweets .
* /
func ( t * TweetResponse ) IsEndOfFeed ( ) bool {
entries := t . Timeline . Instructions [ 0 ] . AddEntries . Entries
if len ( entries ) > 2 {
return false
}
for _ , e := range entries {
if ! strings . Contains ( e . EntryID , "cursor" ) {
return false
}
}
return true
}
2021-08-04 14:49:20 -07:00
// idstr_to_int converts a decimal ID string to an int64 and panics if the string is not a
// valid base-10 integer that fits in 64 bits.
//
// The value is parsed directly as a 64-bit integer rather than through the platform-sized int,
// so large IDs are handled identically on 32-bit and 64-bit builds.
func idstr_to_int(idstr string) int64 {
	id, err := strconv.ParseInt(idstr, 10, 64)
	if err != nil {
		panic(err)
	}
	return id
}