🐦 Twitter Post Details

Viewing enriched Twitter post

@PKirgis

In our most recent evaluations at @halevals, we found Claude Opus 4.5 solves CORE-Bench. How?

Opus 4.5 solves CORE-Bench because it creatively resolves dependency conflicts, bypasses environmental barriers via nuanced benchmark editing, and follows instructions with high fidelity. Opus 4.1 and Sonnet 4, when given the same powerful scaffold, fail because they resort to simulated data when running into conflicts and provide answers using heuristics rather than precise data. We also observe Opus 4.5 more accurately representing its actions in its summary workflow, displaying stronger agentic alignment. 🧵

View on Twitter

📊 Media Metadata

{
  "media": [
    {
      "url": "https://crmoxkoizveukayfjuyo.supabase.co/storage/v1/object/public/media/posts/2000971958956016120/media_0.jpg?",
      "media_url": "https://crmoxkoizveukayfjuyo.supabase.co/storage/v1/object/public/media/posts/2000971958956016120/media_0.jpg?",
      "type": "photo",
      "filename": "media_0.jpg"
    }
  ],
  "processed_at": "2025-12-16T16:58:22.627729",
  "pipeline_version": "2.0"
}

🔧 Raw API Response

{
  "type": "tweet",
  "id": "2000971958956016120",
  "url": "https://x.com/PKirgis/status/2000971958956016120",
  "twitterUrl": "https://twitter.com/PKirgis/status/2000971958956016120",
  "text": "In our most recent evaluations at @halevals, we found Claude Opus 4.5 solves CORE-Bench. How? \n\nOpus 4.5 solves CORE-Bench because it creatively resolves dependency conflicts, bypasses environmental barriers via nuanced benchmark editing, and follows instructions with high fidelity. Opus 4.1 and Sonnet 4, when given the same powerful scaffold, fail because they resort to simulated data when running into conflicts and provide answers using heuristics rather than precise data. We also observe Opus 4.5 more accurately representing its actions in its summary workflow, displaying stronger agentic alignment. 🧵",
  "source": "Twitter for iPhone",
  "retweetCount": 3,
  "replyCount": 1,
  "likeCount": 2,
  "quoteCount": 1,
  "viewCount": 129,
  "createdAt": "Tue Dec 16 16:51:05 +0000 2025",
  "lang": "en",
  "bookmarkCount": 1,
  "isReply": false,
  "inReplyToId": null,
  "conversationId": "2000971958956016120",
  "displayTextRange": [
    0,
    274
  ],
  "inReplyToUserId": null,
  "inReplyToUsername": null,
  "author": {
    "type": "user",
    "userName": "PKirgis",
    "url": "https://x.com/PKirgis",
    "twitterUrl": "https://twitter.com/PKirgis",
    "id": "1036066345547444225",
    "name": "Peter Kirgis",
    "isVerified": false,
    "isBlueVerified": false,
    "verifiedType": null,
    "profilePicture": "https://pbs.twimg.com/profile_images/1953528357653762048/B8dXa0bg_normal.jpg",
    "coverPicture": "",
    "description": "",
    "location": "",
    "followers": 67,
    "following": 238,
    "status": "",
    "canDm": false,
    "canMediaTag": true,
    "createdAt": "Sun Sep 02 01:40:40 +0000 2018",
    "entities": {
      "description": {
        "urls": []
      },
      "url": {}
    },
    "fastFollowersCount": 0,
    "favouritesCount": 54,
    "hasCustomTimelines": true,
    "isTranslator": false,
    "mediaCount": 17,
    "statusesCount": 64,
    "withheldInCountries": [],
    "affiliatesHighlightedLabel": {},
    "possiblySensitive": false,
    "pinnedTweetIds": [],
    "profile_bio": {
      "description": "AI Researcher @PrincetonCITP",
      "entities": {
        "description": {
          "user_mentions": [
            {
              "id_str": "0",
              "indices": [
                14,
                28
              ],
              "name": "",
              "screen_name": "PrincetonCITP"
            }
          ]
        },
        "url": {
          "urls": [
            {
              "display_url": "peterkirgis.github.io",
              "expanded_url": "http://peterkirgis.github.io",
              "indices": [
                0,
                23
              ],
              "url": "https://t.co/6jquEaKWhk"
            }
          ]
        }
      }
    },
    "isAutomated": false,
    "automatedBy": null
  },
  "extendedEntities": {
    "media": [
      {
        "allow_download_status": {
          "allow_download": true
        },
        "display_url": "pic.twitter.com/8Xzizvgwy6",
        "expanded_url": "https://twitter.com/PKirgis/status/2000971958956016120/photo/1",
        "ext_media_availability": {
          "status": "Available"
        },
        "features": {
          "large": {},
          "orig": {}
        },
        "id_str": "2000968752582561797",
        "indices": [
          275,
          298
        ],
        "media_key": "3_2000968752582561797",
        "media_results": {
          "id": "QXBpTWVkaWFSZXN1bHRzOgwAAQoAARvE3nqYVxAFCgACG8ThZSMXQfgAAA==",
          "result": {
            "__typename": "ApiMedia",
            "id": "QXBpTWVkaWE6DAABCgABG8TeephXEAUKAAIbxOFlIxdB+AAA",
            "media_key": "3_2000968752582561797"
          }
        },
        "media_url_https": "https://pbs.twimg.com/media/G8TeephXEAUP6h3.jpg",
        "original_info": {
          "focus_rects": [
            {
              "h": 1782,
              "w": 3182,
              "x": 457,
              "y": 0
            },
            {
              "h": 1782,
              "w": 1782,
              "x": 1157,
              "y": 0
            },
            {
              "h": 1782,
              "w": 1563,
              "x": 1267,
              "y": 0
            },
            {
              "h": 1782,
              "w": 891,
              "x": 1603,
              "y": 0
            },
            {
              "h": 1782,
              "w": 4096,
              "x": 0,
              "y": 0
            }
          ],
          "height": 1782,
          "width": 4096
        },
        "sizes": {
          "large": {
            "h": 891,
            "w": 2048
          }
        },
        "type": "photo",
        "url": "https://t.co/8Xzizvgwy6"
      }
    ]
  },
  "card": null,
  "place": {},
  "entities": {
    "user_mentions": [
      {
        "id_str": "1968001856317038592",
        "indices": [
          34,
          43
        ],
        "name": "Holistic Agent Leaderboard (hal.cs.princeton.edu)",
        "screen_name": "halevals"
      }
    ]
  },
  "quoted_tweet": null,
  "retweeted_tweet": null,
  "isLimitedReply": false,
  "article": null
}