@Xianbao_QIAN
opendatalab/AICC: Markdown version of Common Crawl, extracted by MinerU. Very cool. It only has two shards for now but someone could scale it up to the entire Common Crawl. https://t.co/bH8m8rqbuH
Viewing enriched Twitter post
opendatalab/AICC: Markdown version of Common Crawl, extracted by MinerU. Very cool. It only has two shards for now but someone could scale it up to the entire Common Crawl. https://t.co/bH8m8rqbuH
{
"media": [
{
"url": "https://crmoxkoizveukayfjuyo.supabase.co/storage/v1/object/public/media/posts/1994739386731434362/media_0.jpg?",
"media_url": "https://crmoxkoizveukayfjuyo.supabase.co/storage/v1/object/public/media/posts/1994739386731434362/media_0.jpg?",
"type": "photo",
"filename": "media_0.jpg"
}
],
"processed_at": "2025-11-29T15:59:55.436793",
"pipeline_version": "2.0"
} {
"type": "tweet",
"id": "1994739386731434362",
"url": "https://x.com/Xianbao_QIAN/status/1994739386731434362",
"twitterUrl": "https://twitter.com/Xianbao_QIAN/status/1994739386731434362",
"text": "opendatalab/AICC: Markdown version of Common Crawl, extracted by MinerU.\n\nVery cool. \n\nIt only has two shards for now but someone could scale it up to the entire Common Crawl. https://t.co/bH8m8rqbuH",
"source": "Twitter for iPhone",
"retweetCount": 5,
"replyCount": 1,
"likeCount": 29,
"quoteCount": 0,
"viewCount": 7363,
"createdAt": "Sat Nov 29 12:05:04 +0000 2025",
"lang": "en",
"bookmarkCount": 8,
"isReply": false,
"inReplyToId": null,
"conversationId": "1994739386731434362",
"displayTextRange": [
0,
175
],
"inReplyToUserId": null,
"inReplyToUsername": null,
"author": {
"type": "user",
"userName": "Xianbao_QIAN",
"url": "https://x.com/Xianbao_QIAN",
"twitterUrl": "https://twitter.com/Xianbao_QIAN",
"id": "1597257798068637697",
"name": "Tiezhen WANG",
"isVerified": false,
"isBlueVerified": true,
"verifiedType": null,
"profilePicture": "https://pbs.twimg.com/profile_images/1608827777175912449/3vaCvfND_normal.jpg",
"coverPicture": "",
"description": "",
"location": "",
"followers": 8393,
"following": 2083,
"status": "",
"canDm": false,
"canMediaTag": true,
"createdAt": "Mon Nov 28 15:55:26 +0000 2022",
"entities": {
"description": {
"urls": []
},
"url": {}
},
"fastFollowersCount": 0,
"favouritesCount": 6845,
"hasCustomTimelines": true,
"isTranslator": false,
"mediaCount": 718,
"statusesCount": 2751,
"withheldInCountries": [],
"affiliatesHighlightedLabel": {},
"possiblySensitive": false,
"pinnedTweetIds": [
"1970430148253229355"
],
"profile_bio": {
"description": "Head of APAC ecosystem @huggingface, interested in future tech. Ex-Googler on TFLite / micro. Ideas are my own. DM me to talk open source and robotics in APAC.",
"entities": {
"description": {
"user_mentions": [
{
"id_str": "0",
"indices": [
23,
35
],
"name": "",
"screen_name": "huggingface"
}
]
}
}
},
"isAutomated": false,
"automatedBy": null
},
"extendedEntities": {
"media": [
{
"allow_download_status": {
"allow_download": true
},
"display_url": "pic.twitter.com/bH8m8rqbuH",
"expanded_url": "https://twitter.com/Xianbao_QIAN/status/1994739386731434362/photo/1",
"ext_media_availability": {
"status": "Available"
},
"features": {
"large": {
"faces": [
{
"h": 81,
"w": 81,
"x": 790,
"y": 178
},
{
"h": 155,
"w": 155,
"x": 929,
"y": 466
}
]
},
"orig": {
"faces": [
{
"h": 81,
"w": 81,
"x": 790,
"y": 178
},
{
"h": 155,
"w": 155,
"x": 929,
"y": 466
}
]
}
},
"id_str": "1994738824996687880",
"indices": [
176,
199
],
"media_key": "3_1994738824996687880",
"media_results": {
"id": "QXBpTWVkaWFSZXN1bHRzOgwAAQoAARuuvGSLm5AICgACG66851WbkXoAAA==",
"result": {
"__typename": "ApiMedia",
"id": "QXBpTWVkaWE6DAABCgABG668ZIubkAgKAAIbrrznVZuRegAA",
"media_key": "3_1994738824996687880"
}
},
"media_url_https": "https://pbs.twimg.com/media/G668ZIubkAg5_qq.jpg",
"original_info": {
"focus_rects": [
{
"h": 709,
"w": 1266,
"x": 24,
"y": 0
},
{
"h": 709,
"w": 709,
"x": 303,
"y": 0
},
{
"h": 709,
"w": 622,
"x": 346,
"y": 0
},
{
"h": 709,
"w": 355,
"x": 480,
"y": 0
},
{
"h": 709,
"w": 1386,
"x": 0,
"y": 0
}
],
"height": 709,
"width": 1386
},
"sizes": {
"large": {
"h": 709,
"w": 1386
}
},
"type": "photo",
"url": "https://t.co/bH8m8rqbuH"
}
]
},
"card": null,
"place": {},
"entities": {},
"quoted_tweet": null,
"retweeted_tweet": null,
"isLimitedReply": false,
"article": null
}