187 lines
5.1 KiB
Go
Raw Normal View History

2021-05-22 18:20:18 -04:00
package scraper
2021-06-13 14:34:20 -07:00
import (
"time"
"fmt"
2021-07-22 14:16:40 -07:00
"strings"
"regexp"
"path"
"offline_twitter/terminal_utils"
2021-06-13 14:34:20 -07:00
)
// UserID is the numeric identifier of a User (see User.ID).
type UserID int64
2021-06-16 13:14:56 -07:00
// UserHandle is a user's handle; User.String renders it with a leading "@".
type UserHandle string
2021-06-13 14:34:20 -07:00
2021-07-22 14:16:40 -07:00
func JoinArrayOfHandles(handles []UserHandle) string {
2021-06-16 19:31:27 -07:00
ret := []string{}
2021-07-22 14:16:40 -07:00
for _, h := range handles {
ret = append(ret, string(h))
2021-06-16 19:31:27 -07:00
}
2021-07-22 14:16:40 -07:00
return strings.Join(ret, ",")
2021-06-16 19:31:27 -07:00
}
2021-06-13 14:34:20 -07:00
type User struct {
ID UserID
DisplayName string
Handle UserHandle
Bio string
FollowingCount int
FollowersCount int
Location string
Website string
JoinDate time.Time
IsPrivate bool
IsVerified bool
2022-01-06 14:39:31 -05:00
IsBanned bool
ProfileImageUrl string
ProfileImageLocalPath string
BannerImageUrl string
BannerImageLocalPath string
PinnedTweetID TweetID
PinnedTweet *Tweet
2021-08-07 16:51:38 -07:00
IsContentDownloaded bool
2021-06-13 14:34:20 -07:00
}
func (u User) String() string {
var verified string
if u.IsVerified {
verified = "[\u2713]"
}
ret := fmt.Sprintf(
`%s%s
@%s
%s
Following: %d Followers: %d
Joined %s
%s
%s
`,
u.DisplayName,
verified,
u.Handle,
terminal_utils.WrapText(u.Bio, 60),
u.FollowingCount,
u.FollowersCount,
terminal_utils.FormatDate(u.JoinDate),
u.Location,
u.Website,
)
if u.PinnedTweet != nil {
ret += "\n" + terminal_utils.WrapText(u.PinnedTweet.Text, 60)
} else {
println("Pinned tweet id:", u.PinnedTweetID)
}
return ret
2021-06-13 14:34:20 -07:00
}
2022-01-06 13:43:22 -05:00
// tweetUrlRegexp matches a tweet URL and captures the author's handle.
// It is compiled once at package scope.  The pattern is anchored at the start
// and the dot in the host name is escaped, so only URLs that really begin with
// "https://twitter.com/" are accepted; anything after the numeric status ID
// (e.g. a query string) is ignored.
var tweetUrlRegexp = regexp.MustCompile(`^https://twitter\.com/(\w+)/status/\d+`)

/**
 * Given a tweet URL, return the corresponding user handle.
 * If tweet url is not valid, return an error.
 *
 * A valid URL has the form `https://twitter.com/<handle>/status/<digits>`,
 * optionally followed by further characters.
 */
func ParseHandleFromTweetUrl(tweet_url string) (UserHandle, error) {
	matches := tweetUrlRegexp.FindStringSubmatch(tweet_url)
	if len(matches) != 2 { // matches[0] is the full match, matches[1] is the handle
		return "", fmt.Errorf("Invalid tweet url: %s", tweet_url)
	}
	return UserHandle(matches[1]), nil
}
2021-06-13 14:34:20 -07:00
// Turn an APIUser, as returned from the scraper, into a properly structured User object
func ParseSingleUser(apiUser APIUser) (ret User, err error) {
ret.ID = UserID(apiUser.ID)
2022-01-06 14:39:31 -05:00
if apiUser.IsBanned {
// Banned users won't have any further info, so just return here
ret.IsBanned = true
return
}
2021-06-13 14:34:20 -07:00
ret.DisplayName = apiUser.Name
2021-06-16 13:14:56 -07:00
ret.Handle = UserHandle(apiUser.ScreenName)
2021-06-13 14:34:20 -07:00
ret.Bio = apiUser.Description
ret.FollowingCount = apiUser.FriendsCount
ret.FollowersCount = apiUser.FollowersCount
ret.Location = apiUser.Location
if len(apiUser.Entities.URL.Urls) > 0 {
ret.Website = apiUser.Entities.URL.Urls[0].ExpandedURL
}
ret.JoinDate, err = time.Parse(time.RubyDate, apiUser.CreatedAt)
if err != nil {
return
}
ret.IsPrivate = apiUser.Protected
ret.IsVerified = apiUser.Verified
ret.ProfileImageUrl = apiUser.ProfileImageURLHTTPS
2021-08-10 22:08:01 -07:00
if regexp.MustCompile(`_normal\.\w{2,4}`).MatchString(ret.ProfileImageUrl) {
ret.ProfileImageUrl = strings.ReplaceAll(ret.ProfileImageUrl, "_normal.", ".")
}
2021-06-13 14:34:20 -07:00
ret.BannerImageUrl = apiUser.ProfileBannerURL
ret.ProfileImageLocalPath = ret.compute_profile_image_local_path()
ret.BannerImageLocalPath = ret.compute_banner_image_local_path()
2021-06-13 14:34:20 -07:00
if len(apiUser.PinnedTweetIdsStr) > 0 {
ret.PinnedTweetID = TweetID(idstr_to_int(apiUser.PinnedTweetIdsStr[0]))
2021-06-13 14:34:20 -07:00
}
return
}
2021-06-16 19:31:27 -07:00
// GetUser fetches the user with the given handle through API#GetUser and
// converts the response with ParseSingleUser.  If the fetch fails, an empty
// User and the fetch error are returned.
func GetUser(handle UserHandle) (User, error) {
	client := API{}

	fetched, fetch_err := client.GetUser(handle)
	if fetch_err == nil {
		return ParseSingleUser(fetched)
	}
	return User{}, fetch_err
}
/**
 * Build the local filename for the profile image: the user's handle, then
 * "_profile_", then the last path element of the profile image URL.
 * Prefixing with the handle is meant to keep different users' files apart.
 */
func (u User) compute_profile_image_local_path() string {
	prefix := string(u.Handle) + "_profile_"
	remote_name := path.Base(u.ProfileImageUrl)
	return prefix + remote_name
}
/**
* Make a filename for the banner image, that hopefully won't clobber other ones.
* Add a file extension if necessary (seems to be necessary).
* If there is no banner image, just return nothing.
*/
func (u User) compute_banner_image_local_path() string {
if u.BannerImageUrl == "" {
return ""
}
base_name := path.Base(u.BannerImageUrl)
// Check if it has an extension (e.g., ".png" or ".jpeg")
2021-08-10 22:08:01 -07:00
if !regexp.MustCompile(`\.\w{2,4}$`).MatchString(base_name) {
// If it doesn't have an extension, add one
base_name += ".jpg"
}
return string(u.Handle) + "_banner_" + base_name
}
// profileImageExtensionRegexp captures the trailing file extension of a
// profile image URL (a dot followed by 2-4 word characters at the very end).
// Compiled once at package scope instead of on every call.
var profileImageExtensionRegexp = regexp.MustCompile(`(\.\w{2,4})$`)

/**
 * Get the URL where we would expect to find a User's tiny profile image.
 * This is the profile image URL with "_normal" inserted right before the
 * file extension.
 *
 * Panics if the profile image URL does not end in a file extension (this
 * includes an empty URL).
 */
func (u User) GetTinyProfileImageUrl() string {
	// Check that the format is as expected
	if !profileImageExtensionRegexp.MatchString(u.ProfileImageUrl) {
		panic(fmt.Sprintf("Weird profile image url: %s", u.ProfileImageUrl))
	}
	return profileImageExtensionRegexp.ReplaceAllString(u.ProfileImageUrl, "_normal$1")
}
/**
 * Get the local filename for a User's tiny profile image: the handle, then
 * "_profile_", then the last path element of the tiny profile image URL.
 */
func (u User) GetTinyProfileImageLocalPath() string {
	tiny_name := path.Base(u.GetTinyProfileImageUrl())
	return string(u.Handle) + "_profile_" + tiny_name
}