🐦 Twitter Post Details

Viewing enriched Twitter post

@jerryjliu0

The fundamental issue with PDF parsing is that PDFs are designed for display purposes. The internal representation of data is outputting shapes at specific coordinates on the page (e.g. "render this string at coordinate (84, 720) with this font"). Each displayed character could be not contiguous at all, there could be no font mapping back to unicode so you have no idea what the character is. Any PDF parser needs to magically reconstruct this random sequence of display coordinate data into semantically meaningful text, tables, and more. VLMs do help (screenshot the page and read it), but besides collapsing the metadata they still struggle in terms of accuracy and cost. note: parsing Word/Pptx as text representations so typically a bit easier to read. Our entire company at @llama_index is laser-focused on PDF parsing so we've been really trying to understand all the nuances of doc formats, especially PDFs 🙂 more notes on this coming soon

View on Twitter

📊 Media Metadata

{
  "score": 0.38,
  "score_components": {
    "author": 0.09,
    "engagement": 0.0,
    "quality": 0.08000000000000002,
    "source": 0.135,
    "nlp": 0.05,
    "recency": 0.025
  },
  "scored_at": "2026-03-02T12:00:38.582923",
  "import_source": "api_import",
  "source_tagged_at": "2026-03-02T12:00:38.582940",
  "enriched": true,
  "enriched_at": "2026-03-02T12:00:38.582942"
}

🔧 Raw API Response

{
  "type": "tweet",
  "id": "2028505461717356919",
  "url": "https://x.com/jerryjliu0/status/2028505461717356919",
  "twitterUrl": "https://twitter.com/jerryjliu0/status/2028505461717356919",
  "text": "The fundamental issue with PDF parsing is that PDFs are designed for display purposes. The internal representation of data is outputting shapes at specific coordinates on the page (e.g. \"render this string at coordinate (84, 720) with this font\")\n\neach displayed character could be not contiguous at all, there could be no font mapping back to unicode so you have no idea what the character is. Any PDF parser needs to magically reconstruct this random sequence of display coordinate data into semantically meaningful text, tables, and more. \n\nVLMs do help (screenshot the page and read it), but besides collapsing the metadata they still struggle in terms of accuracy and cost.\n\nnote: parsing Word/Pptx as text representations so typically a bit easier too read. \n\nOur entire company at @llama_index is laser-focused on PDF parsing so we've been really trying to understand all the nuances of doc formats, especially PDFs 🙂 more notes on this coming soon",
  "source": "Twitter for iPhone",
  "retweetCount": 13,
  "replyCount": 14,
  "likeCount": 157,
  "quoteCount": 0,
  "viewCount": 19466,
  "createdAt": "Mon Mar 02 16:19:24 +0000 2026",
  "lang": "en",
  "bookmarkCount": 149,
  "isReply": false,
  "inReplyToId": null,
  "conversationId": "2028505461717356919",
  "displayTextRange": [
    0,
    278
  ],
  "inReplyToUserId": null,
  "inReplyToUsername": null,
  "author": {
    "type": "user",
    "userName": "jerryjliu0",
    "url": "https://x.com/jerryjliu0",
    "twitterUrl": "https://twitter.com/jerryjliu0",
    "id": "369777416",
    "name": "Jerry Liu",
    "isVerified": false,
    "isBlueVerified": true,
    "verifiedType": null,
    "profilePicture": "https://pbs.twimg.com/profile_images/1283610285031460864/1Q4zYhtb_normal.jpg",
    "coverPicture": "",
    "description": "",
    "location": "",
    "followers": 70122,
    "following": 1461,
    "status": "",
    "canDm": true,
    "canMediaTag": true,
    "createdAt": "Wed Sep 07 22:54:31 +0000 2011",
    "entities": {
      "description": {
        "urls": []
      },
      "url": {}
    },
    "fastFollowersCount": 0,
    "favouritesCount": 8427,
    "hasCustomTimelines": true,
    "isTranslator": false,
    "mediaCount": 1429,
    "statusesCount": 6647,
    "withheldInCountries": [],
    "affiliatesHighlightedLabel": {},
    "possiblySensitive": false,
    "pinnedTweetIds": [
      "2001777385193271443"
    ],
    "profile_bio": {
      "description": "document OCR + workflows @llama_index. cofounder/CEO\n\nCareers: https://t.co/EUnMNmb4DZ\nEnterprise: https://t.co/Ht5jwxRU13",
      "entities": {
        "description": {
          "hashtags": [],
          "symbols": [],
          "urls": [
            {
              "display_url": "llamaindex.ai/careers",
              "expanded_url": "https://www.llamaindex.ai/careers",
              "indices": [
                63,
                86
              ],
              "url": "https://t.co/EUnMNmb4DZ"
            },
            {
              "display_url": "llamaindex.ai/contact",
              "expanded_url": "https://www.llamaindex.ai/contact",
              "indices": [
                99,
                122
              ],
              "url": "https://t.co/Ht5jwxRU13"
            }
          ],
          "user_mentions": [
            {
              "id_str": "0",
              "indices": [
                25,
                37
              ],
              "name": "",
              "screen_name": "llama_index"
            }
          ]
        },
        "url": {
          "urls": [
            {
              "display_url": "llamaindex.ai",
              "expanded_url": "https://www.llamaindex.ai/",
              "indices": [
                0,
                23
              ],
              "url": "https://t.co/YiIfjVl1ly"
            }
          ]
        }
      }
    },
    "isAutomated": false,
    "automatedBy": null
  },
  "extendedEntities": {},
  "card": null,
  "place": {},
  "entities": {
    "hashtags": [],
    "symbols": [],
    "urls": [],
    "user_mentions": [
      {
        "id_str": "1604278358296055808",
        "indices": [
          788,
          800
        ],
        "name": "LlamaIndex 🦙",
        "screen_name": "llama_index"
      }
    ]
  },
  "quoted_tweet": {
    "type": "tweet",
    "id": "2028256501115343157",
    "url": "https://x.com/bearlyai/status/2028256501115343157",
    "twitterUrl": "https://twitter.com/bearlyai/status/2028256501115343157",
    "text": "The Verge with technical details on why AI is so bad at reading PDF files: https://t.co/bSvQaoK4Uu",
    "source": "Twitter for iPhone",
    "retweetCount": 74,
    "replyCount": 30,
    "likeCount": 695,
    "quoteCount": 21,
    "viewCount": 310387,
    "createdAt": "Sun Mar 01 23:50:07 +0000 2026",
    "lang": "en",
    "bookmarkCount": 485,
    "isReply": false,
    "inReplyToId": null,
    "conversationId": "2028256501115343157",
    "displayTextRange": [
      0,
      74
    ],
    "inReplyToUserId": null,
    "inReplyToUsername": null,
    "author": {
      "type": "user",
      "userName": "bearlyai",
      "url": "https://x.com/bearlyai",
      "twitterUrl": "https://twitter.com/bearlyai",
      "id": "1579981290840932352",
      "name": "Bearly AI",
      "isVerified": false,
      "isBlueVerified": true,
      "verifiedType": null,
      "profilePicture": "https://pbs.twimg.com/profile_images/1579982083598032896/F7qybk67_normal.jpg",
      "coverPicture": "https://pbs.twimg.com/profile_banners/1579981290840932352/1732068193",
      "description": "",
      "location": "Book a Demo ➡️",
      "followers": 21538,
      "following": 2,
      "status": "",
      "canDm": true,
      "canMediaTag": true,
      "createdAt": "Tue Oct 11 23:44:50 +0000 2022",
      "entities": {
        "description": {
          "urls": []
        },
        "url": {}
      },
      "fastFollowersCount": 0,
      "favouritesCount": 6622,
      "hasCustomTimelines": true,
      "isTranslator": false,
      "mediaCount": 902,
      "statusesCount": 1949,
      "withheldInCountries": [],
      "affiliatesHighlightedLabel": {},
      "possiblySensitive": false,
      "pinnedTweetIds": [
        "1947081034295804247"
      ],
      "profile_bio": {
        "description": "Privacy-first AI research tool with access to ChatGPT, Grok, Claude, Gemini and DeepSeek in one app. Try it at https://t.co/7C2QxQNem1 (by @pnegahdar and @trungtphan)",
        "entities": {
          "description": {
            "hashtags": [],
            "symbols": [],
            "urls": [
              {
                "display_url": "Bearly.AI",
                "expanded_url": "http://Bearly.AI",
                "indices": [
                  111,
                  134
                ],
                "url": "https://t.co/7C2QxQNem1"
              }
            ],
            "user_mentions": [
              {
                "id_str": "0",
                "indices": [
                  139,
                  149
                ],
                "name": "",
                "screen_name": "pnegahdar"
              },
              {
                "id_str": "0",
                "indices": [
                  154,
                  165
                ],
                "name": "",
                "screen_name": "trungtphan"
              }
            ]
          },
          "url": {
            "urls": [
              {
                "display_url": "cal.com/bearly/demo",
                "expanded_url": "https://cal.com/bearly/demo",
                "indices": [
                  0,
                  23
                ],
                "url": "https://t.co/vaztOiBjad"
              }
            ]
          }
        }
      },
      "isAutomated": false,
      "automatedBy": null
    },
    "extendedEntities": {
      "media": [
        {
          "display_url": "pic.twitter.com/bSvQaoK4Uu",
          "expanded_url": "https://twitter.com/bearlyai/status/2028256501115343157/photo/1",
          "ext_media_availability": {
            "status": "Available"
          },
          "features": {
            "large": {
              "faces": []
            },
            "orig": {
              "faces": []
            }
          },
          "id_str": "2028256333087322112",
          "indices": [
            75,
            98
          ],
          "media_key": "3_2028256333087322112",
          "media_results": {
            "id": "QXBpTWVkaWFSZXN1bHRzOgwAAQoAARwl0GOHGzAACgACHCXQiqZbUTUAAA==",
            "result": {
              "__typename": "ApiMedia",
              "id": "QXBpTWVkaWE6DAABCgABHCXQY4cbMAAKAAIcJdCKpltRNQAA",
              "media_key": "3_2028256333087322112"
            }
          },
          "media_url_https": "https://pbs.twimg.com/media/HCXQY4cbMAA7Q_V.jpg",
          "original_info": {
            "focus_rects": [
              {
                "h": 726,
                "w": 1296,
                "x": 0,
                "y": 317
              },
              {
                "h": 1152,
                "w": 1152,
                "x": 0,
                "y": 0
              },
              {
                "h": 1152,
                "w": 1011,
                "x": 0,
                "y": 0
              },
              {
                "h": 1152,
                "w": 576,
                "x": 3,
                "y": 0
              },
              {
                "h": 1152,
                "w": 1296,
                "x": 0,
                "y": 0
              }
            ],
            "height": 1152,
            "width": 1296
          },
          "sizes": {
            "large": {
              "h": 1152,
              "w": 1296
            }
          },
          "type": "photo",
          "url": "https://t.co/bSvQaoK4Uu"
        }
      ]
    },
    "card": null,
    "place": {},
    "entities": {
      "hashtags": [],
      "symbols": [],
      "timestamps": [],
      "urls": [],
      "user_mentions": []
    },
    "quoted_tweet": null,
    "retweeted_tweet": null,
    "isLimitedReply": false,
    "article": null
  },
  "retweeted_tweet": null,
  "isLimitedReply": false,
  "article": null
}