CJ_Clippy 2023c98973 git subrepo clone https://github.com/vladkens/twscrape ./apps/twscrape
subrepo:
  subdir:   "apps/twscrape"
  merged:   "c92e7883"
upstream:
  origin:   "https://github.com/vladkens/twscrape"
  branch:   "main"
  commit:   "c92e7883"
git-subrepo:
  version:  "0.4.9"
  origin:   "???"
  commit:   "???"
2025-03-11 08:51:36 -08:00

515 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from contextlib import aclosing
from typing import Literal
from httpx import Response
from .accounts_pool import AccountsPool
from .logger import set_log_level
from .models import Tweet, User, parse_trends, parse_tweet, parse_tweets, parse_user, parse_users
from .queue_client import QueueClient
from .utils import encode_params, find_obj, get_by_path
# OP_{NAME} {NAME} should be same as second part of GQL ID (required to auto-update script)
OP_SearchTimeline = "U3QTLwGF8sZCHDuWIMSAmg/SearchTimeline"
OP_UserByRestId = "5vdJ5sWkbSRDiiNZvwc2Yg/UserByRestId"
OP_UserByScreenName = "32pL5BWe9WKeSK1MoPvFQQ/UserByScreenName"
OP_TweetDetail = "Ez6kRPyXbqNlhBwcNMpU-Q/TweetDetail"
OP_Followers = "OGScL-RC4DFMsRGOCjPR6g/Followers"
OP_Following = "o5eNLkJb03ayTQa97Cpp7w/Following"
OP_Retweeters = "niCJ2QyTuAgZWv01E7mqJQ/Retweeters"
OP_UserTweets = "Y9WM4Id6UcGFE8Z-hbnixw/UserTweets"
OP_UserTweetsAndReplies = "pZXwh96YGRqmBbbxu7Vk2Q/UserTweetsAndReplies"
OP_ListLatestTweetsTimeline = "H_dAKg97dSn3FOMfrNS8nw/ListLatestTweetsTimeline"
OP_BlueVerifiedFollowers = "WijS8Cwfqhtk5hDN9q7sgw/BlueVerifiedFollowers"
OP_UserCreatorSubscriptions = "H4p-DZU4gYqcZulycazCZw/UserCreatorSubscriptions"
OP_UserMedia = "ophTtKkfXcUKnXlxh9fU5w/UserMedia"
OP_Bookmarks = "1vFR5f4iSCQZLzjdSsNYwA/Bookmarks"
OP_GenericTimelineById = "5u36Lskx1dfACjC_WHmH3Q/GenericTimelineById"
GQL_URL = "https://x.com/i/api/graphql"
GQL_FEATURES = { # search values here (view source) https://x.com/
"articles_preview_enabled": False,
"c9s_tweet_anatomy_moderator_badge_enabled": True,
"communities_web_enable_tweet_community_results_fetch": True,
"creator_subscriptions_quote_tweet_preview_enabled": False,
"creator_subscriptions_tweet_preview_api_enabled": True,
"freedom_of_speech_not_reach_fetch_enabled": True,
"graphql_is_translatable_rweb_tweet_is_translatable_enabled": True,
"longform_notetweets_consumption_enabled": True,
"longform_notetweets_inline_media_enabled": True,
"longform_notetweets_rich_text_read_enabled": True,
"responsive_web_edit_tweet_api_enabled": True,
"responsive_web_enhance_cards_enabled": False,
"responsive_web_graphql_exclude_directive_enabled": True,
"responsive_web_graphql_skip_user_profile_image_extensions_enabled": False,
"responsive_web_graphql_timeline_navigation_enabled": True,
"responsive_web_media_download_video_enabled": False,
"responsive_web_twitter_article_tweet_consumption_enabled": True,
"rweb_tipjar_consumption_enabled": True,
"rweb_video_timestamps_enabled": True,
"standardized_nudges_misinfo": True,
"tweet_awards_web_tipping_enabled": False,
"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": True,
"tweet_with_visibility_results_prefer_gql_media_interstitial_enabled": False,
"tweetypie_unmention_optimization_enabled": True,
"verified_phone_label_enabled": False,
"view_counts_everywhere_api_enabled": True,
"responsive_web_grok_analyze_button_fetch_trends_enabled": False,
"premium_content_api_read_enabled": False,
"profile_label_improvements_pcf_label_in_post_enabled": False,
"responsive_web_grok_share_attachment_enabled": False,
"responsive_web_grok_analyze_post_followups_enabled": False,
"responsive_web_grok_image_annotation_enabled": False,
"responsive_web_grok_analysis_button_from_backend": False,
"responsive_web_jetfuel_frame": False,
}
KV = dict | None
TrendId = Literal["trending", "news", "sport", "entertainment"] | str
class API:
# Note: kv is variables, ft is features from original GQL request
pool: AccountsPool
def __init__(
self,
pool: AccountsPool | str | None = None,
debug=False,
proxy: str | None = None,
raise_when_no_account=False,
):
if isinstance(pool, AccountsPool):
self.pool = pool
elif isinstance(pool, str):
self.pool = AccountsPool(db_file=pool, raise_when_no_account=raise_when_no_account)
else:
self.pool = AccountsPool(raise_when_no_account=raise_when_no_account)
self.proxy = proxy
self.debug = debug
if self.debug:
set_log_level("DEBUG")
# general helpers
def _is_end(self, rep: Response, q: str, res: list, cur: str | None, cnt: int, lim: int):
new_count = len(res)
new_total = cnt + new_count
is_res = new_count > 0
is_cur = cur is not None
is_lim = lim > 0 and new_total >= lim
return rep if is_res else None, new_total, is_cur and not is_lim
def _get_cursor(self, obj: dict, cursor_type="Bottom") -> str | None:
if cur := find_obj(obj, lambda x: x.get("cursorType") == cursor_type):
return cur.get("value")
return None
# gql helpers
async def _gql_items(
self, op: str, kv: dict, ft: dict | None = None, limit=-1, cursor_type="Bottom"
):
queue, cur, cnt, active = op.split("/")[-1], None, 0, True
kv, ft = {**kv}, {**GQL_FEATURES, **(ft or {})}
async with QueueClient(self.pool, queue, self.debug, proxy=self.proxy) as client:
while active:
params = {"variables": kv, "features": ft}
if cur is not None:
params["variables"]["cursor"] = cur
if queue in ("SearchTimeline", "ListLatestTweetsTimeline"):
params["fieldToggles"] = {"withArticleRichContentState": False}
if queue in ("UserMedia",):
params["fieldToggles"] = {"withArticlePlainText": False}
rep = await client.get(f"{GQL_URL}/{op}", params=encode_params(params))
if rep is None:
return
obj = rep.json()
els = get_by_path(obj, "entries") or []
els = [
x
for x in els
if not (
x["entryId"].startswith("cursor-")
or x["entryId"].startswith("messageprompt-")
)
]
cur = self._get_cursor(obj, cursor_type)
rep, cnt, active = self._is_end(rep, queue, els, cur, cnt, limit)
if rep is None:
return
yield rep
async def _gql_item(self, op: str, kv: dict, ft: dict | None = None):
ft = ft or {}
queue = op.split("/")[-1]
async with QueueClient(self.pool, queue, self.debug, proxy=self.proxy) as client:
params = {"variables": {**kv}, "features": {**GQL_FEATURES, **ft}}
return await client.get(f"{GQL_URL}/{op}", params=encode_params(params))
# search
async def search_raw(self, q: str, limit=-1, kv: KV = None):
op = OP_SearchTimeline
kv = {
"rawQuery": q,
"count": 20,
"product": "Latest",
"querySource": "typed_query",
**(kv or {}),
}
async with aclosing(self._gql_items(op, kv, limit=limit)) as gen:
async for x in gen:
yield x
async def search(self, q: str, limit=-1, kv: KV = None):
async with aclosing(self.search_raw(q, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_tweets(rep.json(), limit):
yield x
async def search_user(self, q: str, limit=-1, kv: KV = None):
kv = {"product": "People", **(kv or {})}
async with aclosing(self.search_raw(q, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_users(rep.json(), limit):
yield x
# user_by_id
async def user_by_id_raw(self, uid: int, kv: KV = None):
op = OP_UserByRestId
kv = {"userId": str(uid), "withSafetyModeUserFields": True, **(kv or {})}
ft = {
"hidden_profile_likes_enabled": True,
"highlights_tweets_tab_ui_enabled": True,
"creator_subscriptions_tweet_preview_api_enabled": True,
"hidden_profile_subscriptions_enabled": True,
"responsive_web_twitter_article_notes_tab_enabled": False,
"subscriptions_feature_can_gift_premium": False,
"profile_label_improvements_pcf_label_in_post_enabled": False,
}
return await self._gql_item(op, kv, ft)
async def user_by_id(self, uid: int, kv: KV = None) -> User | None:
rep = await self.user_by_id_raw(uid, kv=kv)
return parse_user(rep) if rep else None
# user_by_login
async def user_by_login_raw(self, login: str, kv: KV = None):
op = OP_UserByScreenName
kv = {"screen_name": login, "withSafetyModeUserFields": True, **(kv or {})}
ft = {
"highlights_tweets_tab_ui_enabled": True,
"hidden_profile_likes_enabled": True,
"creator_subscriptions_tweet_preview_api_enabled": True,
"hidden_profile_subscriptions_enabled": True,
"subscriptions_verification_info_verified_since_enabled": True,
"subscriptions_verification_info_is_identity_verified_enabled": False,
"responsive_web_twitter_article_notes_tab_enabled": False,
"subscriptions_feature_can_gift_premium": False,
"profile_label_improvements_pcf_label_in_post_enabled": False,
}
return await self._gql_item(op, kv, ft)
async def user_by_login(self, login: str, kv: KV = None) -> User | None:
rep = await self.user_by_login_raw(login, kv=kv)
return parse_user(rep) if rep else None
# tweet_details
async def tweet_details_raw(self, twid: int, kv: KV = None):
op = OP_TweetDetail
kv = {
"focalTweetId": str(twid),
"with_rux_injections": True,
"includePromotedContent": True,
"withCommunity": True,
"withQuickPromoteEligibilityTweetFields": True,
"withBirdwatchNotes": True,
"withVoice": True,
"withV2Timeline": True,
**(kv or {}),
}
return await self._gql_item(op, kv)
async def tweet_details(self, twid: int, kv: KV = None) -> Tweet | None:
rep = await self.tweet_details_raw(twid, kv=kv)
return parse_tweet(rep, twid) if rep else None
# tweet_replies
# note: uses same op as tweet_details, see: https://github.com/vladkens/twscrape/issues/104
async def tweet_replies_raw(self, twid: int, limit=-1, kv: KV = None):
op = OP_TweetDetail
kv = {
"focalTweetId": str(twid),
"referrer": "tweet",
"with_rux_injections": True,
"includePromotedContent": True,
"withCommunity": True,
"withQuickPromoteEligibilityTweetFields": True,
"withBirdwatchNotes": True,
"withVoice": True,
"withV2Timeline": True,
**(kv or {}),
}
async with aclosing(
self._gql_items(op, kv, limit=limit, cursor_type="ShowMoreThreads")
) as gen:
async for x in gen:
yield x
async def tweet_replies(self, twid: int, limit=-1, kv: KV = None):
async with aclosing(self.tweet_replies_raw(twid, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_tweets(rep.json(), limit):
if x.inReplyToTweetId == twid:
yield x
# followers
async def followers_raw(self, uid: int, limit=-1, kv: KV = None):
op = OP_Followers
kv = {"userId": str(uid), "count": 20, "includePromotedContent": False, **(kv or {})}
ft = {"responsive_web_twitter_article_notes_tab_enabled": False}
async with aclosing(self._gql_items(op, kv, limit=limit, ft=ft)) as gen:
async for x in gen:
yield x
async def followers(self, uid: int, limit=-1, kv: KV = None):
async with aclosing(self.followers_raw(uid, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_users(rep.json(), limit):
yield x
# verified_followers
async def verified_followers_raw(self, uid: int, limit=-1, kv: KV = None):
op = OP_BlueVerifiedFollowers
kv = {"userId": str(uid), "count": 20, "includePromotedContent": False, **(kv or {})}
ft = {
"responsive_web_twitter_article_notes_tab_enabled": True,
}
async with aclosing(self._gql_items(op, kv, limit=limit, ft=ft)) as gen:
async for x in gen:
yield x
async def verified_followers(self, uid: int, limit=-1, kv: KV = None):
async with aclosing(self.verified_followers_raw(uid, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_users(rep.json(), limit):
yield x
# following
async def following_raw(self, uid: int, limit=-1, kv: KV = None):
op = OP_Following
kv = {"userId": str(uid), "count": 20, "includePromotedContent": False, **(kv or {})}
async with aclosing(self._gql_items(op, kv, limit=limit)) as gen:
async for x in gen:
yield x
async def following(self, uid: int, limit=-1, kv: KV = None):
async with aclosing(self.following_raw(uid, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_users(rep.json(), limit):
yield x
# subscriptions
async def subscriptions_raw(self, uid: int, limit=-1, kv: KV = None):
op = OP_UserCreatorSubscriptions
kv = {"userId": str(uid), "count": 20, "includePromotedContent": False, **(kv or {})}
async with aclosing(self._gql_items(op, kv, limit=limit)) as gen:
async for x in gen:
yield x
async def subscriptions(self, uid: int, limit=-1, kv: KV = None):
async with aclosing(self.subscriptions_raw(uid, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_users(rep.json(), limit):
yield x
# retweeters
async def retweeters_raw(self, twid: int, limit=-1, kv: KV = None):
op = OP_Retweeters
kv = {"tweetId": str(twid), "count": 20, "includePromotedContent": True, **(kv or {})}
async with aclosing(self._gql_items(op, kv, limit=limit)) as gen:
async for x in gen:
yield x
async def retweeters(self, twid: int, limit=-1, kv: KV = None):
async with aclosing(self.retweeters_raw(twid, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_users(rep.json(), limit):
yield x
# user_tweets
async def user_tweets_raw(self, uid: int, limit=-1, kv: KV = None):
op = OP_UserTweets
kv = {
"userId": str(uid),
"count": 40,
"includePromotedContent": True,
"withQuickPromoteEligibilityTweetFields": True,
"withVoice": True,
"withV2Timeline": True,
**(kv or {}),
}
async with aclosing(self._gql_items(op, kv, limit=limit)) as gen:
async for x in gen:
yield x
async def user_tweets(self, uid: int, limit=-1, kv: KV = None):
async with aclosing(self.user_tweets_raw(uid, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_tweets(rep.json(), limit):
yield x
# user_tweets_and_replies
async def user_tweets_and_replies_raw(self, uid: int, limit=-1, kv: KV = None):
op = OP_UserTweetsAndReplies
kv = {
"userId": str(uid),
"count": 40,
"includePromotedContent": True,
"withCommunity": True,
"withVoice": True,
"withV2Timeline": True,
**(kv or {}),
}
async with aclosing(self._gql_items(op, kv, limit=limit)) as gen:
async for x in gen:
yield x
async def user_tweets_and_replies(self, uid: int, limit=-1, kv: KV = None):
async with aclosing(self.user_tweets_and_replies_raw(uid, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_tweets(rep.json(), limit):
yield x
# user_media
async def user_media_raw(self, uid: int, limit=-1, kv: KV = None):
op = OP_UserMedia
kv = {
"userId": str(uid),
"count": 40,
"includePromotedContent": False,
"withClientEventToken": False,
"withBirdwatchNotes": False,
"withVoice": True,
"withV2Timeline": True,
**(kv or {}),
}
async with aclosing(self._gql_items(op, kv, limit=limit)) as gen:
async for x in gen:
yield x
async def user_media(self, uid: int, limit=-1, kv: KV = None):
async with aclosing(self.user_media_raw(uid, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_tweets(rep, limit):
# sometimes some tweets without media, so skip them
media_count = (
len(x.media.photos) + len(x.media.videos) + len(x.media.animated)
if x.media
else 0
)
if media_count > 0:
yield x
# list_timeline
async def list_timeline_raw(self, list_id: int, limit=-1, kv: KV = None):
op = OP_ListLatestTweetsTimeline
kv = {"listId": str(list_id), "count": 20, **(kv or {})}
async with aclosing(self._gql_items(op, kv, limit=limit)) as gen:
async for x in gen:
yield x
async def list_timeline(self, list_id: int, limit=-1, kv: KV = None):
async with aclosing(self.list_timeline_raw(list_id, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_tweets(rep, limit):
yield x
# trends
async def trends_raw(self, trend_id: TrendId, limit=-1, kv: KV = None):
map = {
"trending": "VGltZWxpbmU6DAC2CwABAAAACHRyZW5kaW5nAAA",
"news": "VGltZWxpbmU6DAC2CwABAAAABG5ld3MAAA",
"sport": "VGltZWxpbmU6DAC2CwABAAAABnNwb3J0cwAA",
"entertainment": "VGltZWxpbmU6DAC2CwABAAAADWVudGVydGFpbm1lbnQAAA",
}
trend_id = map.get(trend_id, trend_id)
op = OP_GenericTimelineById
kv = {
"timelineId": trend_id,
"count": 20,
"withQuickPromoteEligibilityTweetFields": True,
**(kv or {}),
}
async with aclosing(self._gql_items(op, kv, limit=limit)) as gen:
async for x in gen:
yield x
async def trends(self, trend_id: TrendId, limit=-1, kv: KV = None):
async with aclosing(self.trends_raw(trend_id, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_trends(rep, limit):
yield x
async def search_trend(self, q: str, limit=-1, kv: KV = None):
kv = {
"querySource": "trend_click",
**(kv or {}),
}
async with aclosing(self.search_raw(q, limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_tweets(rep.json(), limit):
yield x
# Get current user bookmarks
async def bookmarks_raw(self, limit=-1, kv: KV = None):
op = OP_Bookmarks
kv = {
"count": 20,
"includePromotedContent": False,
"withClientEventToken": False,
"withBirdwatchNotes": False,
"withVoice": True,
"withV2Timeline": True,
**(kv or {}),
}
ft = {
"graphql_timeline_v2_bookmark_timeline": True,
}
async with aclosing(self._gql_items(op, kv, ft, limit=limit)) as gen:
async for x in gen:
yield x
async def bookmarks(self, limit=-1, kv: KV = None):
async with aclosing(self.bookmarks_raw(limit=limit, kv=kv)) as gen:
async for rep in gen:
for x in parse_tweets(rep.json(), limit):
yield x