@percyliang
Assignment 4 (data): convert Common Crawl HTML to text, filter filter filter (quality, harmful content, PII), deduplication. This is the grunt work that doesn’t get enough appreciation. https://t.co/60V5MB9uv5
Viewing enriched Twitter post
Assignment 4 (data): convert Common Crawl HTML to text, filter filter filter (quality, harmful content, PII), deduplication. This is the grunt work that doesn’t get enough appreciation. https://t.co/60V5MB9uv5
{
"media": [
{
"type": "photo",
"url": "https://crmoxkoizveukayfjuyo.supabase.co/storage/v1/object/public/media/posts/1935458545958535324/media_0.jpg?",
"filename": "media_0.jpg"
}
],
"processed_at": "2025-08-29T21:53:49.634779",
"pipeline_version": "2.0"
} {
"type": "tweet",
"id": "1935458545958535324",
"url": "https://x.com/percyliang/status/1935458545958535324",
"twitterUrl": "https://twitter.com/percyliang/status/1935458545958535324",
"text": "Assignment 4 (data): convert Common Crawl HTML to text, filter filter filter (quality, harmful content, PII), deduplication. This is the grunt work that doesn’t get enough appreciation.\nhttps://t.co/60V5MB9uv5",
"source": "Twitter for iPhone",
"retweetCount": 6,
"replyCount": 2,
"likeCount": 159,
"quoteCount": 0,
"viewCount": 20681,
"createdAt": "Wed Jun 18 22:04:11 +0000 2025",
"lang": "en",
"bookmarkCount": 46,
"isReply": true,
"inReplyToId": "1935458543001649271",
"conversationId": "1935458532037640211",
"displayTextRange": [
0,
209
],
"inReplyToUserId": "86481377",
"inReplyToUsername": "percyliang",
"author": {
"type": "user",
"userName": "percyliang",
"url": "https://x.com/percyliang",
"twitterUrl": "https://twitter.com/percyliang",
"id": "86481377",
"name": "Percy Liang",
"isVerified": false,
"isBlueVerified": true,
"verifiedType": null,
"profilePicture": "https://pbs.twimg.com/profile_images/1319828397699129346/wjBcgUH0_normal.jpg",
"coverPicture": "",
"description": "",
"location": "Stanford, CA",
"followers": 83393,
"following": 417,
"status": "",
"canDm": false,
"canMediaTag": true,
"createdAt": "Sat Oct 31 07:26:37 +0000 2009",
"entities": {
"description": {
"urls": []
},
"url": {}
},
"fastFollowersCount": 0,
"favouritesCount": 2344,
"hasCustomTimelines": true,
"isTranslator": false,
"mediaCount": 103,
"statusesCount": 1183,
"withheldInCountries": [],
"affiliatesHighlightedLabel": {},
"possiblySensitive": false,
"pinnedTweetIds": [
"1924527490351169964"
],
"profile_bio": {
"description": "Associate Professor in computer science @Stanford @StanfordHAI @StanfordCRFM @StanfordAILab @stanfordnlp | cofounder @togethercompute | Pianist",
"entities": {
"description": {
"user_mentions": [
{
"id_str": "0",
"indices": [
40,
49
],
"name": "",
"screen_name": "Stanford"
},
{
"id_str": "0",
"indices": [
50,
62
],
"name": "",
"screen_name": "StanfordHAI"
},
{
"id_str": "0",
"indices": [
63,
76
],
"name": "",
"screen_name": "StanfordCRFM"
},
{
"id_str": "0",
"indices": [
77,
91
],
"name": "",
"screen_name": "StanfordAILab"
},
{
"id_str": "0",
"indices": [
92,
104
],
"name": "",
"screen_name": "stanfordnlp"
},
{
"id_str": "0",
"indices": [
117,
133
],
"name": "",
"screen_name": "togethercompute"
}
]
},
"url": {
"urls": [
{
"display_url": "cs.stanford.edu/~pliang/",
"expanded_url": "https://cs.stanford.edu/~pliang/",
"indices": [
0,
23
],
"url": "https://t.co/eDtsxiBXQg"
}
]
}
}
},
"isAutomated": false,
"automatedBy": null
},
"extendedEntities": {},
"card": {
"binding_values": [
{
"key": "photo_image_full_size_large",
"value": {
"image_value": {
"height": 419,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=jpg&name=800x419",
"width": 800
}
}
},
{
"key": "thumbnail_image",
"value": {
"image_value": {
"height": 200,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=jpg&name=400x400",
"width": 400
}
}
},
{
"key": "description",
"value": {
"string_value": "Contribute to stanford-cs336/assignment4-data development by creating an account on GitHub."
}
},
{
"key": "domain",
"value": {
"string_value": "github.com"
}
},
{
"key": "thumbnail_image_large",
"value": {
"image_value": {
"height": 300,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=jpg&name=600x600",
"width": 600
}
}
},
{
"key": "summary_photo_image_small",
"value": {
"image_value": {
"height": 202,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=jpg&name=386x202",
"width": 386
}
}
},
{
"key": "thumbnail_image_original",
"value": {
"image_value": {
"height": 600,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=jpg&name=orig",
"width": 1200
}
}
},
{
"key": "site",
"value": {
"scribe_key": "publisher_id",
"user_value": {
"id_str": "13334762"
}
}
},
{
"key": "photo_image_full_size_small",
"value": {
"image_value": {
"height": 202,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=jpg&name=386x202",
"width": 386
}
}
},
{
"key": "summary_photo_image_large",
"value": {
"image_value": {
"height": 419,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=jpg&name=800x419",
"width": 800
}
}
},
{
"key": "thumbnail_image_small",
"value": {
"image_value": {
"height": 72,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=jpg&name=144x144",
"width": 144
}
}
},
{
"key": "thumbnail_image_x_large",
"value": {
"image_value": {
"height": 600,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=png&name=2048x2048_2_exp",
"width": 1200
}
}
},
{
"key": "photo_image_full_size_original",
"value": {
"image_value": {
"height": 600,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=jpg&name=orig",
"width": 1200
}
}
},
{
"key": "photo_image_full_size_alt_text",
"value": {
"string_value": "Contribute to stanford-cs336/assignment4-data development by creating an account on GitHub."
}
},
{
"key": "vanity_url",
"value": {
"scribe_key": "vanity_url",
"string_value": "github.com"
}
},
{
"key": "photo_image_full_size",
"value": {
"image_value": {
"height": 314,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=jpg&name=600x314",
"width": 600
}
}
},
{
"key": "summary_photo_image_alt_text",
"value": {
"string_value": "Contribute to stanford-cs336/assignment4-data development by creating an account on GitHub."
}
},
{
"key": "thumbnail_image_color",
"value": {
"image_color_value": {
"palette": [
{
"percentage": 93.08,
"rgb": {
"blue": 255,
"green": 255,
"red": 255
}
},
{
"percentage": 3.76,
"rgb": {
"blue": 166,
"green": 115,
"red": 53
}
},
{
"percentage": 1.51,
"rgb": {
"blue": 126,
"green": 119,
"red": 116
}
},
{
"percentage": 1.46,
"rgb": {
"blue": 226,
"green": 146,
"red": 145
}
}
]
}
}
},
{
"key": "title",
"value": {
"string_value": "assignment4-data/cs336_spring2025_assignment4_data.pdf at main · stanford-cs336/assignment4-data"
}
},
{
"key": "summary_photo_image_color",
"value": {
"image_color_value": {
"palette": [
{
"percentage": 93.08,
"rgb": {
"blue": 255,
"green": 255,
"red": 255
}
},
{
"percentage": 3.76,
"rgb": {
"blue": 166,
"green": 115,
"red": 53
}
},
{
"percentage": 1.51,
"rgb": {
"blue": 126,
"green": 119,
"red": 116
}
},
{
"percentage": 1.46,
"rgb": {
"blue": 226,
"green": 146,
"red": 145
}
}
]
}
}
},
{
"key": "summary_photo_image_x_large",
"value": {
"image_value": {
"height": 600,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=png&name=2048x2048_2_exp",
"width": 1200
}
}
},
{
"key": "summary_photo_image",
"value": {
"image_value": {
"height": 314,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=jpg&name=600x314",
"width": 600
}
}
},
{
"key": "photo_image_full_size_color",
"value": {
"image_color_value": {
"palette": [
{
"percentage": 93.08,
"rgb": {
"blue": 255,
"green": 255,
"red": 255
}
},
{
"percentage": 3.76,
"rgb": {
"blue": 166,
"green": 115,
"red": 53
}
},
{
"percentage": 1.51,
"rgb": {
"blue": 126,
"green": 119,
"red": 116
}
},
{
"percentage": 1.46,
"rgb": {
"blue": 226,
"green": 146,
"red": 145
}
}
]
}
}
},
{
"key": "photo_image_full_size_x_large",
"value": {
"image_value": {
"height": 600,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=png&name=2048x2048_2_exp",
"width": 1200
}
}
},
{
"key": "card_url",
"value": {
"scribe_key": "card_url",
"string_value": "https://t.co/60V5MB9uv5"
}
},
{
"key": "summary_photo_image_original",
"value": {
"image_value": {
"height": 600,
"url": "https://pbs.twimg.com/card_img/1960861345001431040/kOcXxQJy?format=jpg&name=orig",
"width": 1200
}
}
}
],
"card_platform": {
"platform": {
"audience": {
"name": "production"
},
"device": {
"name": "iPhone",
"version": "13"
}
}
},
"name": "summary_large_image",
"url": "https://t.co/60V5MB9uv5",
"user_refs_results": [
{
"rest_id": "13334762",
"result": {
"__typename": "User",
"action_counts": {
"favorites_count": 8199
},
"avatar": {
"image_url": "https://pbs.twimg.com/profile_images/1633247750010830848/8zfRrYjA_normal.png"
},
"banner": {
"image_url": "https://pbs.twimg.com/profile_banners/13334762/1747774520"
},
"core": {
"created_at": "Mon Feb 11 04:41:50 +0000 2008",
"name": "GitHub",
"screen_name": "github"
},
"dm_permissions": {
"can_dm": false
},
"exclusive_tweet_following": false,
"identity_profile_labels_highlighted_label": {},
"location": {
"location": "San Francisco, CA"
},
"media_permissions": {
"can_media_tag": true
},
"notifications_settings": {},
"pinned_items": {},
"privacy": {},
"private_super_following": false,
"profile_bio": {
"description": "The AI-powered developer platform to build, scale, and deliver secure software.",
"entities": {
"description": {},
"url": {
"urls": [
{
"display_url": "github.com",
"expanded_url": "http://github.com",
"indices": [
0,
23
],
"url": "https://t.co/bbJgfyzcJR"
}
]
}
}
},
"profile_image_shape": "Square",
"profile_metadata": {
"profile_interstitial_type": "",
"profile_link_color": "981CEB"
},
"profile_translation": {
"translator_type_enum": "None"
},
"properties": {
"has_extended_profile": true
},
"relationship_counts": {
"followers": 2641543,
"following": 327
},
"relationship_perspectives": {},
"rest_id": "13334762",
"super_follow_eligible": false,
"super_followed_by": false,
"super_following": false,
"tweet_counts": {
"media_tweets": 2732,
"tweets": 9885
},
"website": {
"url": "https://t.co/bbJgfyzcJR"
},
"verification": {
"is_blue_verified": true,
"verified_type": "Business"
}
}
}
]
},
"place": {},
"entities": {
"urls": [
{
"display_url": "github.com/stanford-cs336…",
"expanded_url": "https://github.com/stanford-cs336/assignment4-data/blob/main/cs336_spring2025_assignment4_data.pdf",
"indices": [
186,
209
],
"url": "https://t.co/60V5MB9uv5"
}
]
},
"quoted_tweet": null,
"retweeted_tweet": null,
"isLimitedReply": false,
"article": null
}