From 0d121669668bd7617f645d93dabf4752204257e3 Mon Sep 17 00:00:00 2001 From: Alessio Date: Fri, 8 Mar 2024 17:22:52 -0800 Subject: [PATCH] Add parsing of DM images, videos and URLs --- pkg/scraper/api_types.go | 1 + pkg/scraper/api_types_dms.go | 29 ++++++++++-- pkg/scraper/api_types_dms_test.go | 74 +++++++++++++++++++++++++++++++ pkg/scraper/dm_message.go | 34 ++++++++++++++ pkg/scraper/image.go | 15 ++++--- pkg/scraper/tweet.go | 3 +- pkg/scraper/url.go | 25 ++++++----- pkg/scraper/video.go | 16 +++---- pkg/scraper/video_test.go | 7 +-- 9 files changed, 167 insertions(+), 37 deletions(-) diff --git a/pkg/scraper/api_types.go b/pkg/scraper/api_types.go index 00d177f..2d668cf 100644 --- a/pkg/scraper/api_types.go +++ b/pkg/scraper/api_types.go @@ -51,6 +51,7 @@ type APIExtendedMedia struct { R interface{} `json:"r"` } `json:"mediaStats"` } `json:"ext"` + URL string `json:"url"` // For DM videos } type APICard struct { diff --git a/pkg/scraper/api_types_dms.go b/pkg/scraper/api_types_dms.go index 05fd24d..052a0b6 100644 --- a/pkg/scraper/api_types_dms.go +++ b/pkg/scraper/api_types_dms.go @@ -28,10 +28,12 @@ type APIDMMessage struct { ReplyData struct { ID int `json:"id,string"` } `json:"reply_data"` - URLs []struct { - Url string `json:"url"` - Indices []int `json:"indices"` - } `json:"urls"` + Entities struct { + URLs []struct { + ExpandedURL string `json:"expanded_url"` + ShortenedUrl string `json:"url"` + } `json:"urls"` + } `json:"entities"` Attachment struct { Tweet struct { Url string `json:"url"` @@ -40,6 +42,9 @@ type APIDMMessage struct { User APIUser `json:"user"` } `json:"status"` } `json:"tweet"` + Photo APIMedia `json:"photo"` + Video APIExtendedMedia `json:"video"` + Card APICard `json:"card"` } `json:"attachment"` } `json:"message_data"` MessageReactions []APIDMReaction `json:"message_reactions"` @@ -47,9 +52,25 @@ type APIDMMessage struct { // Remove embedded tweet short-URLs func (m *APIDMMessage) NormalizeContent() { + // All URLs + for _, url := range m.MessageData.Entities.URLs { + index := strings.Index(m.MessageData.Text, url.ShortenedUrl) + if index == (len(m.MessageData.Text) - len(url.ShortenedUrl)) { + m.MessageData.Text = strings.TrimSpace(m.MessageData.Text[0:index]) + } + } + + // Specific items if m.MessageData.Attachment.Tweet.Status.ID != 0 { m.MessageData.Text = strings.Replace(m.MessageData.Text, m.MessageData.Attachment.Tweet.Url, "", 1) } + if m.MessageData.Attachment.Photo.ID != 0 { + m.MessageData.Text = strings.Replace(m.MessageData.Text, m.MessageData.Attachment.Photo.URL, "", 1) + } + if m.MessageData.Attachment.Video.ID != 0 { + m.MessageData.Text = strings.Replace(m.MessageData.Text, m.MessageData.Attachment.Video.URL, "", 1) + } + m.MessageData.Text = strings.TrimSpace(m.MessageData.Text) } diff --git a/pkg/scraper/api_types_dms_test.go b/pkg/scraper/api_types_dms_test.go index 3f7f1b9..442d17e 100644 --- a/pkg/scraper/api_types_dms_test.go +++ b/pkg/scraper/api_types_dms_test.go @@ -81,6 +81,80 @@ func TestParseAPIDMMessageWithEmbeddedTweet(t *testing.T) { assert.True(is_ok) } +func TestParseAPIDMMessageWithEmbeddedImage(t *testing.T) { + assert := assert.New(t) + data, err := os.ReadFile("test_responses/dms/dm_message_with_image.json") + if err != nil { + panic(err) + } + var api_message APIDMMessage + err = json.Unmarshal(data, &api_message) + require.NoError(t, err) + + trove := api_message.ToDMTrove() + + assert.Len(trove.Messages, 1) + m, is_ok := trove.Messages[DMMessageID(1766224476729995648)] + assert.True(is_ok) + + // Check that the short-URL is stripped + assert.Equal("A gastropub staffed by white college girls and the chefs are all Latino", m.Text) + + assert.Len(m.Images, 1) + assert.Equal(m.ID, m.Images[0].DMMessageID) + assert.Equal("https://ton.twitter.com/1.1/ton/data/dm/1766224476729995648/1766224374648958976/L4Ah1GSh.jpg", m.Images[0].RemoteURL) +} + +func TestParseAPIDMMessageWithEmbeddedVideo(t *testing.T) { + assert := assert.New(t) + data, err := os.ReadFile("test_responses/dms/dm_message_with_video.json") + if err != nil { + panic(err) + } + var api_message APIDMMessage + err = json.Unmarshal(data, &api_message) + require.NoError(t, err) + + trove := api_message.ToDMTrove() + + assert.Len(trove.Messages, 1) + m, is_ok := trove.Messages[DMMessageID(1766248283901776125)] + assert.True(is_ok) + + // Check the short-URL is stripped + assert.Equal("", m.Text) + + assert.Len(m.Videos, 1) + assert.Equal(m.ID, m.Videos[0].DMMessageID) + assert.Equal( + "https://video.twimg.com/dm_video/1766248268416385024/vid/avc1/500x280/edFuZXtEVvem158AjvmJ3SZ_1DdG9cbSoW4fm6cDF1k.mp4?tag=1", + m.Videos[0].RemoteURL) +} + +func TestParseAPIDMMessageWithUrlCard(t *testing.T) { + assert := assert.New(t) + data, err := os.ReadFile("test_responses/dms/dm_message_with_url_card.json") + if err != nil { + panic(err) + } + var api_message APIDMMessage + err = json.Unmarshal(data, &api_message) + require.NoError(t, err) + + trove := api_message.ToDMTrove() + + assert.Len(trove.Messages, 1) + m, is_ok := trove.Messages[DMMessageID(1766255994668191902)] + assert.True(is_ok) + assert.Len(m.Urls, 1) + assert.Equal("You wrote this?", m.Text) + url := m.Urls[0] + assert.Equal(m.ID, url.DMMessageID) + assert.Equal("https://offline-twitter.com/introduction/data-ownership-and-composability/", url.Text) + assert.Equal("offline-twitter.com", url.Domain) + assert.Equal("Data ownership and composability", url.Title) +} + func TestParseAPIDMConversation(t *testing.T) { assert := assert.New(t) data, err := os.ReadFile("test_responses/dms/dm_chat_room.json") diff --git a/pkg/scraper/dm_message.go b/pkg/scraper/dm_message.go index 450f979..9955388 100644 --- a/pkg/scraper/dm_message.go +++ b/pkg/scraper/dm_message.go @@ -29,6 +29,10 @@ type DMMessage struct { InReplyToID DMMessageID `db:"in_reply_to_id"` EmbeddedTweetID TweetID `db:"embedded_tweet_id"` Reactions map[UserID]DMReaction + + Images []Image + Videos []Video + Urls []Url } func ParseAPIDMMessage(message APIDMMessage) DMMessage { @@ -47,5 +51,35 @@ func ParseAPIDMMessage(message APIDMMessage) DMMessage { reacc.DMMessageID = ret.ID ret.Reactions[reacc.SenderID] = reacc } + if message.MessageData.Attachment.Photo.ID != 0 { + new_image := ParseAPIMedia(message.MessageData.Attachment.Photo) + new_image.DMMessageID = ret.ID + ret.Images = []Image{new_image} + } + if message.MessageData.Attachment.Video.ID != 0 { + entity := message.MessageData.Attachment.Video + if entity.Type == "video" || entity.Type == "animated_gif" { + new_video := ParseAPIVideo(entity) + new_video.DMMessageID = ret.ID + ret.Videos = append(ret.Videos, new_video) + } + } + + // Process URLs and link previews + for _, url := range message.MessageData.Entities.URLs { + var new_url Url + if message.MessageData.Attachment.Card.ShortenedUrl == url.ShortenedUrl { + if message.MessageData.Attachment.Card.Name == "3691233323:audiospace" { + // This "url" is just a link to a Space. Don't process it as a Url + continue + } + new_url = ParseAPIUrlCard(message.MessageData.Attachment.Card) + } + new_url.Text = url.ExpandedURL + new_url.ShortText = url.ShortenedUrl + new_url.DMMessageID = ret.ID + ret.Urls = append(ret.Urls, new_url) + } + return ret } diff --git a/pkg/scraper/image.go b/pkg/scraper/image.go index b17e192..e6ededc 100644 --- a/pkg/scraper/image.go +++ b/pkg/scraper/image.go @@ -7,13 +7,14 @@ import ( type ImageID int64 type Image struct { - ID ImageID `db:"id"` - TweetID TweetID `db:"tweet_id"` - Width int `db:"width"` - Height int `db:"height"` - RemoteURL string `db:"remote_url"` - LocalFilename string `db:"local_filename"` - IsDownloaded bool `db:"is_downloaded"` + ID ImageID `db:"id"` + TweetID TweetID `db:"tweet_id"` + DMMessageID DMMessageID `db:"chat_message_id"` + Width int `db:"width"` + Height int `db:"height"` + RemoteURL string `db:"remote_url"` + LocalFilename string `db:"local_filename"` + IsDownloaded bool `db:"is_downloaded"` } func ParseAPIMedia(apiMedia APIMedia) Image { diff --git a/pkg/scraper/tweet.go b/pkg/scraper/tweet.go index 5e39875..e259169 100644 --- a/pkg/scraper/tweet.go +++ b/pkg/scraper/tweet.go @@ -203,7 +203,8 @@ func ParseSingleTweet(apiTweet APITweet) (ret Tweet, err error) { continue } - new_video := ParseAPIVideo(entity, ret.ID) // This assigns TweetID + new_video := ParseAPIVideo(entity) + new_video.TweetID = ret.ID ret.Videos = append(ret.Videos, new_video) // Remove the thumbnail from the Images list diff --git a/pkg/scraper/url.go b/pkg/scraper/url.go index db1c505..8798dea 100644 --- a/pkg/scraper/url.go +++ b/pkg/scraper/url.go @@ -8,18 +8,19 @@ import ( ) type Url struct { - TweetID TweetID `db:"tweet_id"` - Domain string `db:"domain"` - Text string `db:"text"` - ShortText string `db:"short_text"` - Title string `db:"title"` - Description string `db:"description"` - ThumbnailWidth int `db:"thumbnail_width"` - ThumbnailHeight int `db:"thumbnail_height"` - ThumbnailRemoteUrl string `db:"thumbnail_remote_url"` - ThumbnailLocalPath string `db:"thumbnail_local_path"` - CreatorID UserID `db:"creator_id"` - SiteID UserID `db:"site_id"` + TweetID TweetID `db:"tweet_id"` + DMMessageID DMMessageID `db:"chat_message_id"` + Domain string `db:"domain"` + Text string `db:"text"` + ShortText string `db:"short_text"` + Title string `db:"title"` + Description string `db:"description"` + ThumbnailWidth int `db:"thumbnail_width"` + ThumbnailHeight int `db:"thumbnail_height"` + ThumbnailRemoteUrl string `db:"thumbnail_remote_url"` + ThumbnailLocalPath string `db:"thumbnail_local_path"` + CreatorID UserID `db:"creator_id"` + SiteID UserID `db:"site_id"` HasCard bool `db:"has_card"` HasThumbnail bool `db:"has_thumbnail"` diff --git a/pkg/scraper/video.go b/pkg/scraper/video.go index 947128e..25f886a 100644 --- a/pkg/scraper/video.go +++ b/pkg/scraper/video.go @@ -12,12 +12,13 @@ type VideoID int64 // from someone else). type Video struct { - ID VideoID `db:"id"` - TweetID TweetID `db:"tweet_id"` - Width int `db:"width"` - Height int `db:"height"` - RemoteURL string `db:"remote_url"` - LocalFilename string `db:"local_filename"` + ID VideoID `db:"id"` + TweetID TweetID `db:"tweet_id"` + DMMessageID DMMessageID `db:"chat_message_id"` + Width int `db:"width"` + Height int `db:"height"` + RemoteURL string `db:"remote_url"` + LocalFilename string `db:"local_filename"` ThumbnailRemoteUrl string `db:"thumbnail_remote_url"` ThumbnailLocalPath string `db:"thumbnail_local_filename"` @@ -38,7 +39,7 @@ func get_filename(remote_url string) string { return path.Base(u.Path) } -func ParseAPIVideo(apiVideo APIExtendedMedia, tweet_id TweetID) Video { +func ParseAPIVideo(apiVideo APIExtendedMedia) Video { variants := apiVideo.VideoInfo.Variants sort.Sort(variants) video_remote_url := variants[0].URL @@ -66,7 +67,6 @@ func ParseAPIVideo(apiVideo APIExtendedMedia, tweet_id TweetID) Video { return Video{ ID: VideoID(apiVideo.ID), - TweetID: tweet_id, Width: apiVideo.OriginalInfo.Width, Height: apiVideo.OriginalInfo.Height, RemoteURL: video_remote_url, diff --git a/pkg/scraper/video_test.go b/pkg/scraper/video_test.go index a394877..d042442 100644 --- a/pkg/scraper/video_test.go +++ b/pkg/scraper/video_test.go @@ -21,10 +21,8 @@ func TestParseAPIVideo(t *testing.T) { err = json.Unmarshal(data, &apivideo) require.NoError(err) - tweet_id := TweetID(28) - video := ParseAPIVideo(apivideo, tweet_id) + video := ParseAPIVideo(apivideo) assert.Equal(VideoID(1418951950020845568), video.ID) - assert.Equal(tweet_id, video.TweetID) assert.Equal(1280, video.Height) assert.Equal(720, video.Width) assert.Equal("https://video.twimg.com/ext_tw_video/1418951950020845568/pu/vid/720x1280/sm4iL9_f8Lclh0aa.mp4?tag=12", video.RemoteURL) @@ -46,7 +44,6 @@ func TestParseGeoblockedVideo(t *testing.T) { err = json.Unmarshal(data, &apivideo) require.NoError(err) - tweet_id := TweetID(28) - video := ParseAPIVideo(apivideo, tweet_id) + video := ParseAPIVideo(apivideo) assert.True(video.IsGeoblocked) }