From 6ed8beae26da412ac792161286503607ee9c29a0 Mon Sep 17 00:00:00 2001 From: Alessio Date: Sun, 22 Sep 2024 16:22:33 -0700 Subject: [PATCH] Update SaveUser logic to account for duplicate handles, with better handling of deleted / banned users - BUGFIX: deleted / banned users will no longer have all their other info wiped - e.g., follower counts, profile image URLs, bio, etc. --- pkg/persistence/user_queries.go | 160 ++++++++++++++++++++++----- pkg/persistence/user_queries_test.go | 141 ++++++++++++++++++++++- 2 files changed, 271 insertions(+), 30 deletions(-) diff --git a/pkg/persistence/user_queries.go b/pkg/persistence/user_queries.go index 85450e5..dace0b0 100644 --- a/pkg/persistence/user_queries.go +++ b/pkg/persistence/user_queries.go @@ -7,21 +7,47 @@ import ( "path" "github.com/jmoiron/sqlx" + "github.com/mattn/go-sqlite3" "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper" ) +type ErrConflictingUserHandle struct { + ConflictingUserID scraper.UserID +} + +func (e ErrConflictingUserHandle) Error() string { + return fmt.Sprintf("active user with given handle already exists (id: %d)", e.ConflictingUserID) +} + const USERS_ALL_SQL_FIELDS = ` id, display_name, handle, bio, following_count, followers_count, location, website, join_date, is_private, is_verified, is_banned, is_deleted, profile_image_url, profile_image_local_path, banner_image_url, banner_image_local_path, pinned_tweet_id, is_content_downloaded, is_followed` -// Save the given User to the database. -// If the User is already in the database, it will update most of its attributes (follower count, etc) +// User save strategy: // -// args: -// - u: the User +// 1. Check if the user needs a fake ID; if so, assign one +// 2. Try to execute an update +// 2a. if the user is banned or deleted, don't overwrite other fields, blanking them +// 2b. if the user exists but `handle` conflicts with an active user, do conflict handling +// 3. If the user doesn't already exist, execute an insert. Do conflict handling if applicable +// +// Conflict handling: +// +// 1. Look up the ID of the user with conflicting handle +// 2. TODO: handle case where the previous user has a fake ID +// May have to rescrape that user's tweets to figure out if they're the same user or not. +// Strategy 1: assume they're the same users +// - Execute a full update on the old user, including their ID (we have a real ID for them now) +// - Update all the other tables (tweets, follows, lists, etc) with the new ID +// Strategy 2: assume they're different users +// - Mark the old user as deactivated and be done with it +// 3. Mark the old user as deactivated, eliminating the conflict +// 4. Re-save the new user +// 5. Return an ErrConflictingUserHandle, notifying the caller of the conflict func (p Profile) SaveUser(u *scraper.User) error { + // First, check if the user needs a fake ID, and generate one if needed if u.IsNeedingFakeID { // User is fake; check if we already have them, in order to proceed err := p.DB.QueryRow("select id from users_by_handle where lower(handle) = lower(?)", u.Handle).Scan(&u.ID) @@ -39,39 +65,121 @@ func (p Profile) SaveUser(u *scraper.User) error { } } - _, err := p.DB.NamedExec(` + // Handler function to deal with UNIQUE constraint violations on `handle`. + // + // We know the UNIQUE violation must be on `handle`, because we checked for users with this ID + // above (`update` query). + handle_conflict := func() error { + var old_user scraper.User + err := p.DB.Get(&old_user, + `select id, is_id_fake from users where handle = ? and is_banned = 0 and is_deleted = 0`, + u.Handle, + ) + if err != nil { + panic(err) + } + if old_user.IsIdFake { + panic("TODO: user with fake ID") + } else { + // 1. The being-saved user ID doesn't exist yet (or was previously inactive) + // 2. There's another user with the same handle who's currently considered active + // 3. Their ID is not fake. + // 4. The being-saved user is also understood to be active (otherwise a UNIQUE handle + // conflict wouldn't have occurred) + // + // Since we're saving an active user, the old user is presumably no longer active. + // They will need to be rescraped when posssible, to find out what's going on. For + // now, we will just mark them as deleted. + _, err := p.DB.Exec(`update users set is_deleted=1 where id = ?`, old_user.ID) + if err != nil { + panic(err) + } + // Now we can save our new user. Should succeed since the conflict is cleared: + err = p.SaveUser(u) + if err != nil { + panic(err) + } + // Notify caller of the duplicate for rescraping + return ErrConflictingUserHandle{ConflictingUserID: old_user.ID} + } + } + + // Try to treat it like an `update` and see if it works + var result sql.Result + var err error + if u.IsBanned || u.IsDeleted { + // If user is banned or deleted, it's a stub, so don't update other fields + result, err = p.DB.NamedExec(`update users set is_deleted=:is_deleted, is_banned=:is_banned where id = :id`, u) + } else { + // This could be re-activating a previously deleted / banned user + result, err = p.DB.NamedExec(` + update users + set handle=:handle, + bio=:bio, + display_name=:display_name, + following_count=:following_count, + followers_count=:followers_count, + location=:location, + website=:website, + is_private=:is_private, + is_verified=:is_verified, + is_banned=:is_banned, + is_deleted=:is_deleted, + profile_image_url=:profile_image_url, + profile_image_local_path=:profile_image_local_path, + banner_image_url=:banner_image_url, + banner_image_local_path=:banner_image_local_path, + pinned_tweet_id=:pinned_tweet_id, + is_content_downloaded=(is_content_downloaded or :is_content_downloaded) + where id = :id + `, u) + } + if err != nil { + // Check for UNIQUE constraint violation on `handle` field + var sqliteErr sqlite3.Error + is_ok := errors.As(err, &sqliteErr) + if is_ok && sqliteErr.ExtendedCode == sqlite3.ErrConstraintUnique { + return handle_conflict() + } else { + // Unexpected error + return fmt.Errorf("Error executing SaveUser(%s):\n %w", u.Handle, err) + } + } + // If a row was updated, then the User already exists and was updated successfully; we're done + rows_affected, err := result.RowsAffected() + if err != nil { + panic(err) + } + if rows_affected > 0 { + return nil + } + + // It's a new user. Try to insert it: + _, err = p.DB.NamedExec(` insert into users (id, display_name, handle, bio, following_count, followers_count, location, website, join_date, is_private, is_verified, is_banned, is_deleted, profile_image_url, profile_image_local_path, banner_image_url, banner_image_local_path, pinned_tweet_id, is_content_downloaded, is_id_fake) values (:id, :display_name, :handle, :bio, :following_count, :followers_count, :location, :website, :join_date, :is_private, :is_verified, :is_banned, :is_deleted, :profile_image_url, :profile_image_local_path, :banner_image_url, :banner_image_local_path, :pinned_tweet_id, :is_content_downloaded, :is_id_fake) - on conflict do update - set handle=:handle, - bio=:bio, - display_name=:display_name, - following_count=:following_count, - followers_count=:followers_count, - location=:location, - website=:website, - is_private=:is_private, - is_verified=:is_verified, - is_banned=:is_banned, - is_deleted=:is_deleted, - profile_image_url=:profile_image_url, - profile_image_local_path=:profile_image_local_path, - banner_image_url=:banner_image_url, - banner_image_local_path=:banner_image_local_path, - pinned_tweet_id=:pinned_tweet_id, - is_content_downloaded=(is_content_downloaded or :is_content_downloaded) `, u, ) - if err != nil { - return fmt.Errorf("Error executing SaveUser(%s):\n %w", u.Handle, err) + if err == nil { + // It worked; user is inserted, we're done + return nil } - return nil + // If execution reaches this point, then an error has occurred; err is not nil. + // Check if it's a UNIQUE CONSTRAINT FAILED: + var sqliteErr sqlite3.Error + is_ok := errors.As(err, &sqliteErr) + if is_ok && sqliteErr.ExtendedCode == sqlite3.ErrConstraintUnique { // Conflict detected + return handle_conflict() + } else { + // Some other error + return fmt.Errorf("Error executing SaveUser(%s):\n %w", u.Handle, err) + } } // Retrieve a User from the database, by handle. diff --git a/pkg/persistence/user_queries_test.go b/pkg/persistence/user_queries_test.go index c3232e5..39800f7 100644 --- a/pkg/persistence/user_queries_test.go +++ b/pkg/persistence/user_queries_test.go @@ -11,6 +11,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "gitlab.com/offline-twitter/twitter_offline_engine/pkg/persistence" "gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper" ) @@ -52,10 +53,9 @@ func TestModifyUser(t *testing.T) { user := create_dummy_user() user.DisplayName = "Display Name 1" user.Location = "location1" - user.Handle = "handle 1" + user.Handle = scraper.UserHandle(fmt.Sprintf("handle %d", rand.Int31())) user.IsPrivate = false user.IsVerified = false - user.IsBanned = false user.FollowersCount = 1000 user.JoinDate = scraper.TimestampFromUnix(1000) user.ProfileImageUrl = "asdf" @@ -72,7 +72,6 @@ func TestModifyUser(t *testing.T) { user.Handle = new_handle user.IsPrivate = true user.IsVerified = true - user.IsBanned = true user.FollowersCount = 2000 user.JoinDate = scraper.TimestampFromUnix(2000) user.ProfileImageUrl = "asdf2" @@ -91,13 +90,147 @@ func TestModifyUser(t *testing.T) { assert.Equal("location2", new_user.Location) assert.True(new_user.IsPrivate) assert.True(new_user.IsVerified) - assert.True(new_user.IsBanned) assert.Equal(2000, new_user.FollowersCount) assert.Equal(int64(1000), new_user.JoinDate.Unix()) assert.Equal("asdf2", new_user.ProfileImageUrl) assert.True(new_user.IsContentDownloaded) } +func TestSetUserBannedDeleted(t *testing.T) { + assert := assert.New(t) + require := require.New(t) + + profile_path := "test_profiles/TestUserQueries" + profile := create_or_load_profile(profile_path) + + user := create_dummy_user() + user.DisplayName = "Display Name 1" + user.Location = "location1" + user.Bio = "Some Bio" + user.Handle = scraper.UserHandle(fmt.Sprintf("handle %d", rand.Int31())) + user.FollowersCount = 1000 + user.JoinDate = scraper.TimestampFromUnix(1000) + user.ProfileImageUrl = "asdf" + user.IsContentDownloaded = true + + // Save the user so it can be modified + fmt.Println("---------- Saving the user for the first time; should do insert") + err := profile.SaveUser(&user) + require.NoError(err) + + // Now the user deactivates + user.IsDeleted = true + fmt.Println("---------- Saving the user for the second time; should do update") + err = profile.SaveUser(&user) + require.NoError(err) + // Reload the modified user + new_user, err := profile.GetUserByID(user.ID) + require.NoError(err) + + assert.True(new_user.IsDeleted) + assert.Equal(new_user.Handle, user.Handle) + assert.Equal(new_user.DisplayName, user.DisplayName) + assert.Equal(new_user.Location, user.Location) + assert.Equal(new_user.Bio, user.Bio) + assert.Equal(new_user.FollowersCount, user.FollowersCount) + assert.Equal(new_user.JoinDate, user.JoinDate) + assert.Equal(new_user.ProfileImageUrl, user.ProfileImageUrl) + assert.Equal(new_user.IsContentDownloaded, user.IsContentDownloaded) +} + +func TestSaveAndLoadBannedDeletedUser(t *testing.T) { + require := require.New(t) + + profile_path := "test_profiles/TestUserQueries" + profile := create_or_load_profile(profile_path) + + user := scraper.User{ + ID: scraper.UserID(rand.Int31()), + Handle: scraper.UserHandle(fmt.Sprintf("handle-%d", rand.Int31())), + IsBanned: true, + } + + // Save the user, then reload it and ensure it's the same + err := profile.SaveUser(&user) + require.NoError(err) + new_user, err := profile.GetUserByID(user.ID) + require.NoError(err) + + if diff := deep.Equal(new_user, user); diff != nil { + t.Error(diff) + } +} + +func TestInsertUserWithDuplicateHandle(t *testing.T) { + require := require.New(t) + profile_path := "test_profiles/TestUserQueries" + profile := create_or_load_profile(profile_path) + + user1 := create_dummy_user() + err := profile.SaveUser(&user1) + require.NoError(err) + + user2 := create_dummy_user() + user2.Handle = user1.Handle + err = profile.SaveUser(&user2) + var conflict_err persistence.ErrConflictingUserHandle + require.ErrorAs(err, &conflict_err) + require.Equal(conflict_err.ConflictingUserID, user1.ID) + + // Should have inserted user2 + new_user2, err := profile.GetUserByID(user2.ID) + require.NoError(err) + if diff := deep.Equal(user2, new_user2); diff != nil { + t.Error(diff) + } + + // user1 should be marked as deleted + new_user1, err := profile.GetUserByID(user1.ID) + require.NoError(err) + user1.IsDeleted = true // for equality check + if diff := deep.Equal(user1, new_user1); diff != nil { + t.Error(diff) + } +} + +func TestReviveDeletedUserWithDuplicateHandle(t *testing.T) { + require := require.New(t) + assert := assert.New(t) + profile_path := "test_profiles/TestUserQueries" + profile := create_or_load_profile(profile_path) + + // Create the first user (deleted) + user1 := create_dummy_user() + user1.DisplayName = "user1" + user1.IsDeleted = true + err := profile.SaveUser(&user1) + require.NoError(err) + + // Create the second user (active) + user2 := create_dummy_user() + user2.Handle = user1.Handle + user2.DisplayName = "user2" + err = profile.SaveUser(&user2) + require.NoError(err) + + // Reactivate the 1st user; should return the 2nd user's ID as a conflict + user1.IsDeleted = false + err = profile.SaveUser(&user1) + var conflict_err persistence.ErrConflictingUserHandle + require.ErrorAs(err, &conflict_err) + require.Equal(conflict_err.ConflictingUserID, user2.ID) + + // User1 should be updated (no longer deleted) + new_user1, err := profile.GetUserByID(user1.ID) + require.NoError(err) + assert.False(new_user1.IsDeleted) + // User2 should be marked deleted + new_user2, err := profile.GetUserByID(user2.ID) + require.NoError(err) + assert.True(new_user2.IsDeleted) +} + +// `profile.GetUserByHandle` should be case-insensitive func TestHandleIsCaseInsensitive(t *testing.T) { profile_path := "test_profiles/TestUserQueries" profile := create_or_load_profile(profile_path)