Update SaveUser logic to account for duplicate handles, with better handling of deleted / banned users
- BUGFIX: deleted / banned users will no longer have all their other info wiped - e.g., follower counts, profile image URLs, bio, etc.
This commit is contained in:
parent
79033cfc79
commit
6ed8beae26
@ -7,21 +7,47 @@ import (
|
|||||||
"path"
|
"path"
|
||||||
|
|
||||||
"github.com/jmoiron/sqlx"
|
"github.com/jmoiron/sqlx"
|
||||||
|
"github.com/mattn/go-sqlite3"
|
||||||
|
|
||||||
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
|
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type ErrConflictingUserHandle struct {
|
||||||
|
ConflictingUserID scraper.UserID
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e ErrConflictingUserHandle) Error() string {
|
||||||
|
return fmt.Sprintf("active user with given handle already exists (id: %d)", e.ConflictingUserID)
|
||||||
|
}
|
||||||
|
|
||||||
const USERS_ALL_SQL_FIELDS = `
|
const USERS_ALL_SQL_FIELDS = `
|
||||||
id, display_name, handle, bio, following_count, followers_count, location, website, join_date, is_private, is_verified,
|
id, display_name, handle, bio, following_count, followers_count, location, website, join_date, is_private, is_verified,
|
||||||
is_banned, is_deleted, profile_image_url, profile_image_local_path, banner_image_url, banner_image_local_path,
|
is_banned, is_deleted, profile_image_url, profile_image_local_path, banner_image_url, banner_image_local_path,
|
||||||
pinned_tweet_id, is_content_downloaded, is_followed`
|
pinned_tweet_id, is_content_downloaded, is_followed`
|
||||||
|
|
||||||
// Save the given User to the database.
|
// User save strategy:
|
||||||
// If the User is already in the database, it will update most of its attributes (follower count, etc)
|
|
||||||
//
|
//
|
||||||
// args:
|
// 1. Check if the user needs a fake ID; if so, assign one
|
||||||
// - u: the User
|
// 2. Try to execute an update
|
||||||
|
// 2a. if the user is banned or deleted, don't overwrite other fields, blanking them
|
||||||
|
// 2b. if the user exists but `handle` conflicts with an active user, do conflict handling
|
||||||
|
// 3. If the user doesn't already exist, execute an insert. Do conflict handling if applicable
|
||||||
|
//
|
||||||
|
// Conflict handling:
|
||||||
|
//
|
||||||
|
// 1. Look up the ID of the user with conflicting handle
|
||||||
|
// 2. TODO: handle case where the previous user has a fake ID
|
||||||
|
// May have to rescrape that user's tweets to figure out if they're the same user or not.
|
||||||
|
// Strategy 1: assume they're the same users
|
||||||
|
// - Execute a full update on the old user, including their ID (we have a real ID for them now)
|
||||||
|
// - Update all the other tables (tweets, follows, lists, etc) with the new ID
|
||||||
|
// Strategy 2: assume they're different users
|
||||||
|
// - Mark the old user as deactivated and be done with it
|
||||||
|
// 3. Mark the old user as deactivated, eliminating the conflict
|
||||||
|
// 4. Re-save the new user
|
||||||
|
// 5. Return an ErrConflictingUserHandle, notifying the caller of the conflict
|
||||||
func (p Profile) SaveUser(u *scraper.User) error {
|
func (p Profile) SaveUser(u *scraper.User) error {
|
||||||
|
// First, check if the user needs a fake ID, and generate one if needed
|
||||||
if u.IsNeedingFakeID {
|
if u.IsNeedingFakeID {
|
||||||
// User is fake; check if we already have them, in order to proceed
|
// User is fake; check if we already have them, in order to proceed
|
||||||
err := p.DB.QueryRow("select id from users_by_handle where lower(handle) = lower(?)", u.Handle).Scan(&u.ID)
|
err := p.DB.QueryRow("select id from users_by_handle where lower(handle) = lower(?)", u.Handle).Scan(&u.ID)
|
||||||
@ -39,14 +65,55 @@ func (p Profile) SaveUser(u *scraper.User) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
_, err := p.DB.NamedExec(`
|
// Handler function to deal with UNIQUE constraint violations on `handle`.
|
||||||
insert into users (id, display_name, handle, bio, following_count, followers_count, location, website, join_date, is_private,
|
//
|
||||||
is_verified, is_banned, is_deleted, profile_image_url, profile_image_local_path, banner_image_url,
|
// We know the UNIQUE violation must be on `handle`, because we checked for users with this ID
|
||||||
banner_image_local_path, pinned_tweet_id, is_content_downloaded, is_id_fake)
|
// above (`update` query).
|
||||||
values (:id, :display_name, :handle, :bio, :following_count, :followers_count, :location, :website, :join_date, :is_private,
|
handle_conflict := func() error {
|
||||||
:is_verified, :is_banned, :is_deleted, :profile_image_url, :profile_image_local_path, :banner_image_url,
|
var old_user scraper.User
|
||||||
:banner_image_local_path, :pinned_tweet_id, :is_content_downloaded, :is_id_fake)
|
err := p.DB.Get(&old_user,
|
||||||
on conflict do update
|
`select id, is_id_fake from users where handle = ? and is_banned = 0 and is_deleted = 0`,
|
||||||
|
u.Handle,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
if old_user.IsIdFake {
|
||||||
|
panic("TODO: user with fake ID")
|
||||||
|
} else {
|
||||||
|
// 1. The being-saved user ID doesn't exist yet (or was previously inactive)
|
||||||
|
// 2. There's another user with the same handle who's currently considered active
|
||||||
|
// 3. Their ID is not fake.
|
||||||
|
// 4. The being-saved user is also understood to be active (otherwise a UNIQUE handle
|
||||||
|
// conflict wouldn't have occurred)
|
||||||
|
//
|
||||||
|
// Since we're saving an active user, the old user is presumably no longer active.
|
||||||
|
// They will need to be rescraped when posssible, to find out what's going on. For
|
||||||
|
// now, we will just mark them as deleted.
|
||||||
|
_, err := p.DB.Exec(`update users set is_deleted=1 where id = ?`, old_user.ID)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
// Now we can save our new user. Should succeed since the conflict is cleared:
|
||||||
|
err = p.SaveUser(u)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
// Notify caller of the duplicate for rescraping
|
||||||
|
return ErrConflictingUserHandle{ConflictingUserID: old_user.ID}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to treat it like an `update` and see if it works
|
||||||
|
var result sql.Result
|
||||||
|
var err error
|
||||||
|
if u.IsBanned || u.IsDeleted {
|
||||||
|
// If user is banned or deleted, it's a stub, so don't update other fields
|
||||||
|
result, err = p.DB.NamedExec(`update users set is_deleted=:is_deleted, is_banned=:is_banned where id = :id`, u)
|
||||||
|
} else {
|
||||||
|
// This could be re-activating a previously deleted / banned user
|
||||||
|
result, err = p.DB.NamedExec(`
|
||||||
|
update users
|
||||||
set handle=:handle,
|
set handle=:handle,
|
||||||
bio=:bio,
|
bio=:bio,
|
||||||
display_name=:display_name,
|
display_name=:display_name,
|
||||||
@ -64,14 +131,55 @@ func (p Profile) SaveUser(u *scraper.User) error {
|
|||||||
banner_image_local_path=:banner_image_local_path,
|
banner_image_local_path=:banner_image_local_path,
|
||||||
pinned_tweet_id=:pinned_tweet_id,
|
pinned_tweet_id=:pinned_tweet_id,
|
||||||
is_content_downloaded=(is_content_downloaded or :is_content_downloaded)
|
is_content_downloaded=(is_content_downloaded or :is_content_downloaded)
|
||||||
|
where id = :id
|
||||||
|
`, u)
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
// Check for UNIQUE constraint violation on `handle` field
|
||||||
|
var sqliteErr sqlite3.Error
|
||||||
|
is_ok := errors.As(err, &sqliteErr)
|
||||||
|
if is_ok && sqliteErr.ExtendedCode == sqlite3.ErrConstraintUnique {
|
||||||
|
return handle_conflict()
|
||||||
|
} else {
|
||||||
|
// Unexpected error
|
||||||
|
return fmt.Errorf("Error executing SaveUser(%s):\n %w", u.Handle, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// If a row was updated, then the User already exists and was updated successfully; we're done
|
||||||
|
rows_affected, err := result.RowsAffected()
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
if rows_affected > 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// It's a new user. Try to insert it:
|
||||||
|
_, err = p.DB.NamedExec(`
|
||||||
|
insert into users (id, display_name, handle, bio, following_count, followers_count, location, website, join_date, is_private,
|
||||||
|
is_verified, is_banned, is_deleted, profile_image_url, profile_image_local_path, banner_image_url,
|
||||||
|
banner_image_local_path, pinned_tweet_id, is_content_downloaded, is_id_fake)
|
||||||
|
values (:id, :display_name, :handle, :bio, :following_count, :followers_count, :location, :website, :join_date, :is_private,
|
||||||
|
:is_verified, :is_banned, :is_deleted, :profile_image_url, :profile_image_local_path, :banner_image_url,
|
||||||
|
:banner_image_local_path, :pinned_tweet_id, :is_content_downloaded, :is_id_fake)
|
||||||
`,
|
`,
|
||||||
u,
|
u,
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err == nil {
|
||||||
return fmt.Errorf("Error executing SaveUser(%s):\n %w", u.Handle, err)
|
// It worked; user is inserted, we're done
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
// If execution reaches this point, then an error has occurred; err is not nil.
|
||||||
|
// Check if it's a UNIQUE CONSTRAINT FAILED:
|
||||||
|
var sqliteErr sqlite3.Error
|
||||||
|
is_ok := errors.As(err, &sqliteErr)
|
||||||
|
if is_ok && sqliteErr.ExtendedCode == sqlite3.ErrConstraintUnique { // Conflict detected
|
||||||
|
return handle_conflict()
|
||||||
|
} else {
|
||||||
|
// Some other error
|
||||||
|
return fmt.Errorf("Error executing SaveUser(%s):\n %w", u.Handle, err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Retrieve a User from the database, by handle.
|
// Retrieve a User from the database, by handle.
|
||||||
|
@ -11,6 +11,7 @@ import (
|
|||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
|
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/persistence"
|
||||||
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
|
"gitlab.com/offline-twitter/twitter_offline_engine/pkg/scraper"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -52,10 +53,9 @@ func TestModifyUser(t *testing.T) {
|
|||||||
user := create_dummy_user()
|
user := create_dummy_user()
|
||||||
user.DisplayName = "Display Name 1"
|
user.DisplayName = "Display Name 1"
|
||||||
user.Location = "location1"
|
user.Location = "location1"
|
||||||
user.Handle = "handle 1"
|
user.Handle = scraper.UserHandle(fmt.Sprintf("handle %d", rand.Int31()))
|
||||||
user.IsPrivate = false
|
user.IsPrivate = false
|
||||||
user.IsVerified = false
|
user.IsVerified = false
|
||||||
user.IsBanned = false
|
|
||||||
user.FollowersCount = 1000
|
user.FollowersCount = 1000
|
||||||
user.JoinDate = scraper.TimestampFromUnix(1000)
|
user.JoinDate = scraper.TimestampFromUnix(1000)
|
||||||
user.ProfileImageUrl = "asdf"
|
user.ProfileImageUrl = "asdf"
|
||||||
@ -72,7 +72,6 @@ func TestModifyUser(t *testing.T) {
|
|||||||
user.Handle = new_handle
|
user.Handle = new_handle
|
||||||
user.IsPrivate = true
|
user.IsPrivate = true
|
||||||
user.IsVerified = true
|
user.IsVerified = true
|
||||||
user.IsBanned = true
|
|
||||||
user.FollowersCount = 2000
|
user.FollowersCount = 2000
|
||||||
user.JoinDate = scraper.TimestampFromUnix(2000)
|
user.JoinDate = scraper.TimestampFromUnix(2000)
|
||||||
user.ProfileImageUrl = "asdf2"
|
user.ProfileImageUrl = "asdf2"
|
||||||
@ -91,13 +90,147 @@ func TestModifyUser(t *testing.T) {
|
|||||||
assert.Equal("location2", new_user.Location)
|
assert.Equal("location2", new_user.Location)
|
||||||
assert.True(new_user.IsPrivate)
|
assert.True(new_user.IsPrivate)
|
||||||
assert.True(new_user.IsVerified)
|
assert.True(new_user.IsVerified)
|
||||||
assert.True(new_user.IsBanned)
|
|
||||||
assert.Equal(2000, new_user.FollowersCount)
|
assert.Equal(2000, new_user.FollowersCount)
|
||||||
assert.Equal(int64(1000), new_user.JoinDate.Unix())
|
assert.Equal(int64(1000), new_user.JoinDate.Unix())
|
||||||
assert.Equal("asdf2", new_user.ProfileImageUrl)
|
assert.Equal("asdf2", new_user.ProfileImageUrl)
|
||||||
assert.True(new_user.IsContentDownloaded)
|
assert.True(new_user.IsContentDownloaded)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSetUserBannedDeleted(t *testing.T) {
|
||||||
|
assert := assert.New(t)
|
||||||
|
require := require.New(t)
|
||||||
|
|
||||||
|
profile_path := "test_profiles/TestUserQueries"
|
||||||
|
profile := create_or_load_profile(profile_path)
|
||||||
|
|
||||||
|
user := create_dummy_user()
|
||||||
|
user.DisplayName = "Display Name 1"
|
||||||
|
user.Location = "location1"
|
||||||
|
user.Bio = "Some Bio"
|
||||||
|
user.Handle = scraper.UserHandle(fmt.Sprintf("handle %d", rand.Int31()))
|
||||||
|
user.FollowersCount = 1000
|
||||||
|
user.JoinDate = scraper.TimestampFromUnix(1000)
|
||||||
|
user.ProfileImageUrl = "asdf"
|
||||||
|
user.IsContentDownloaded = true
|
||||||
|
|
||||||
|
// Save the user so it can be modified
|
||||||
|
fmt.Println("---------- Saving the user for the first time; should do insert")
|
||||||
|
err := profile.SaveUser(&user)
|
||||||
|
require.NoError(err)
|
||||||
|
|
||||||
|
// Now the user deactivates
|
||||||
|
user.IsDeleted = true
|
||||||
|
fmt.Println("---------- Saving the user for the second time; should do update")
|
||||||
|
err = profile.SaveUser(&user)
|
||||||
|
require.NoError(err)
|
||||||
|
// Reload the modified user
|
||||||
|
new_user, err := profile.GetUserByID(user.ID)
|
||||||
|
require.NoError(err)
|
||||||
|
|
||||||
|
assert.True(new_user.IsDeleted)
|
||||||
|
assert.Equal(new_user.Handle, user.Handle)
|
||||||
|
assert.Equal(new_user.DisplayName, user.DisplayName)
|
||||||
|
assert.Equal(new_user.Location, user.Location)
|
||||||
|
assert.Equal(new_user.Bio, user.Bio)
|
||||||
|
assert.Equal(new_user.FollowersCount, user.FollowersCount)
|
||||||
|
assert.Equal(new_user.JoinDate, user.JoinDate)
|
||||||
|
assert.Equal(new_user.ProfileImageUrl, user.ProfileImageUrl)
|
||||||
|
assert.Equal(new_user.IsContentDownloaded, user.IsContentDownloaded)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSaveAndLoadBannedDeletedUser(t *testing.T) {
|
||||||
|
require := require.New(t)
|
||||||
|
|
||||||
|
profile_path := "test_profiles/TestUserQueries"
|
||||||
|
profile := create_or_load_profile(profile_path)
|
||||||
|
|
||||||
|
user := scraper.User{
|
||||||
|
ID: scraper.UserID(rand.Int31()),
|
||||||
|
Handle: scraper.UserHandle(fmt.Sprintf("handle-%d", rand.Int31())),
|
||||||
|
IsBanned: true,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save the user, then reload it and ensure it's the same
|
||||||
|
err := profile.SaveUser(&user)
|
||||||
|
require.NoError(err)
|
||||||
|
new_user, err := profile.GetUserByID(user.ID)
|
||||||
|
require.NoError(err)
|
||||||
|
|
||||||
|
if diff := deep.Equal(new_user, user); diff != nil {
|
||||||
|
t.Error(diff)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInsertUserWithDuplicateHandle(t *testing.T) {
|
||||||
|
require := require.New(t)
|
||||||
|
profile_path := "test_profiles/TestUserQueries"
|
||||||
|
profile := create_or_load_profile(profile_path)
|
||||||
|
|
||||||
|
user1 := create_dummy_user()
|
||||||
|
err := profile.SaveUser(&user1)
|
||||||
|
require.NoError(err)
|
||||||
|
|
||||||
|
user2 := create_dummy_user()
|
||||||
|
user2.Handle = user1.Handle
|
||||||
|
err = profile.SaveUser(&user2)
|
||||||
|
var conflict_err persistence.ErrConflictingUserHandle
|
||||||
|
require.ErrorAs(err, &conflict_err)
|
||||||
|
require.Equal(conflict_err.ConflictingUserID, user1.ID)
|
||||||
|
|
||||||
|
// Should have inserted user2
|
||||||
|
new_user2, err := profile.GetUserByID(user2.ID)
|
||||||
|
require.NoError(err)
|
||||||
|
if diff := deep.Equal(user2, new_user2); diff != nil {
|
||||||
|
t.Error(diff)
|
||||||
|
}
|
||||||
|
|
||||||
|
// user1 should be marked as deleted
|
||||||
|
new_user1, err := profile.GetUserByID(user1.ID)
|
||||||
|
require.NoError(err)
|
||||||
|
user1.IsDeleted = true // for equality check
|
||||||
|
if diff := deep.Equal(user1, new_user1); diff != nil {
|
||||||
|
t.Error(diff)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReviveDeletedUserWithDuplicateHandle(t *testing.T) {
|
||||||
|
require := require.New(t)
|
||||||
|
assert := assert.New(t)
|
||||||
|
profile_path := "test_profiles/TestUserQueries"
|
||||||
|
profile := create_or_load_profile(profile_path)
|
||||||
|
|
||||||
|
// Create the first user (deleted)
|
||||||
|
user1 := create_dummy_user()
|
||||||
|
user1.DisplayName = "user1"
|
||||||
|
user1.IsDeleted = true
|
||||||
|
err := profile.SaveUser(&user1)
|
||||||
|
require.NoError(err)
|
||||||
|
|
||||||
|
// Create the second user (active)
|
||||||
|
user2 := create_dummy_user()
|
||||||
|
user2.Handle = user1.Handle
|
||||||
|
user2.DisplayName = "user2"
|
||||||
|
err = profile.SaveUser(&user2)
|
||||||
|
require.NoError(err)
|
||||||
|
|
||||||
|
// Reactivate the 1st user; should return the 2nd user's ID as a conflict
|
||||||
|
user1.IsDeleted = false
|
||||||
|
err = profile.SaveUser(&user1)
|
||||||
|
var conflict_err persistence.ErrConflictingUserHandle
|
||||||
|
require.ErrorAs(err, &conflict_err)
|
||||||
|
require.Equal(conflict_err.ConflictingUserID, user2.ID)
|
||||||
|
|
||||||
|
// User1 should be updated (no longer deleted)
|
||||||
|
new_user1, err := profile.GetUserByID(user1.ID)
|
||||||
|
require.NoError(err)
|
||||||
|
assert.False(new_user1.IsDeleted)
|
||||||
|
// User2 should be marked deleted
|
||||||
|
new_user2, err := profile.GetUserByID(user2.ID)
|
||||||
|
require.NoError(err)
|
||||||
|
assert.True(new_user2.IsDeleted)
|
||||||
|
}
|
||||||
|
|
||||||
|
// `profile.GetUserByHandle` should be case-insensitive
|
||||||
func TestHandleIsCaseInsensitive(t *testing.T) {
|
func TestHandleIsCaseInsensitive(t *testing.T) {
|
||||||
profile_path := "test_profiles/TestUserQueries"
|
profile_path := "test_profiles/TestUserQueries"
|
||||||
profile := create_or_load_profile(profile_path)
|
profile := create_or_load_profile(profile_path)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user