🐦 Twitter Post Details

Viewing enriched Twitter post

@sayashk

On our evals for HAL, we found that agents figure out they're being evaluated even on capability evals. For example, here Claude 3.7 Sonnet *looks up the benchmark on HuggingFace* to find the answer to an AssistantBench question. There were many such cases across benchmarks and models. Of course, you can make it harder for the agent to cheat, such as by blocking HuggingFace or encrypting the dataset. But so long as the benchmark is available somewhere in public, an agent could theoretically follow the same steps a human would to access it (e.g., decrypting a password-protected benchmark on its sandbox). So agent log analysis will become necessary even for capability evals. HAL now has logs from 20,000+ rollouts across 9 benchmarks, and we are analyzing all of these logs using @TransluceAI's Docent.

Media 1

📊 Media Metadata

{
  "media": [
    {
      "url": "https://crmoxkoizveukayfjuyo.supabase.co/storage/v1/object/public/media/posts/1973408326324265462/media_0.jpg?",
      "media_url": "https://crmoxkoizveukayfjuyo.supabase.co/storage/v1/object/public/media/posts/1973408326324265462/media_0.jpg?",
      "type": "photo",
      "filename": "media_0.jpg"
    }
  ],
  "processed_at": "2025-10-06T13:30:57.022458",
  "pipeline_version": "2.0"
}

🔧 Raw API Response

{
  "type": "tweet",
  "id": "1973408326324265462",
  "url": "https://x.com/sayashk/status/1973408326324265462",
  "twitterUrl": "https://twitter.com/sayashk/status/1973408326324265462",
  "text": "On our evals for HAL, we found that agents figure out they're being evaluated even on capability evals. \n\nFor example, here Claude 3.7 Sonnet *looks up the benchmark on HuggingFace* to find the answer to an AssistantBench question. There were many such cases across benchmarks and models. \n\nOf course, you can make it harder for the agent to cheat, such as by blocking HuggingFace or encrypting the dataset. But so long as the benchmark is available somewhere in public, an agent could theoretically follow the same steps a human would to access it (e.g., decrypting a password-protected benchmark on its sandbox). \n\nSo agent log analysis will become necessary even for capability evals. HAL now has logs from 20,000+ rollouts across 9 benchmarks, and we are analyzing all of these logs using @TransluceAI's Docent.",
  "source": "Twitter for iPhone",
  "retweetCount": 13,
  "replyCount": 2,
  "likeCount": 38,
  "quoteCount": 0,
  "viewCount": 10091,
  "createdAt": "Wed Oct 01 15:23:03 +0000 2025",
  "lang": "en",
  "bookmarkCount": 11,
  "isReply": false,
  "inReplyToId": null,
  "conversationId": "1973408326324265462",
  "displayTextRange": [
    0,
    281
  ],
  "inReplyToUserId": null,
  "inReplyToUsername": null,
  "author": {
    "type": "user",
    "userName": "sayashk",
    "url": "https://x.com/sayashk",
    "twitterUrl": "https://twitter.com/sayashk",
    "id": "3084274082",
    "name": "Sayash Kapoor",
    "isVerified": false,
    "isBlueVerified": false,
    "verifiedType": null,
    "profilePicture": "https://pbs.twimg.com/profile_images/1521238232867946496/U_GCI43e_normal.jpg",
    "coverPicture": "https://pbs.twimg.com/profile_banners/3084274082/1712802699",
    "description": "",
    "location": "Princeton",
    "followers": 10399,
    "following": 2103,
    "status": "",
    "canDm": true,
    "canMediaTag": true,
    "createdAt": "Sun Mar 15 09:03:24 +0000 2015",
    "entities": {
      "description": {
        "urls": []
      },
      "url": {}
    },
    "fastFollowersCount": 0,
    "favouritesCount": 4163,
    "hasCustomTimelines": true,
    "isTranslator": false,
    "mediaCount": 192,
    "statusesCount": 1316,
    "withheldInCountries": [],
    "affiliatesHighlightedLabel": {},
    "possiblySensitive": false,
    "pinnedTweetIds": [
      "1967998405852152039"
    ],
    "profile_bio": {
      "description": "CS PhD candidate @PrincetonCITP. I tweet about AI agents, AI evals, AI for science.\nAI as Normal Technology: https://t.co/5amOkqKDf2\nBook: https://t.co/DabpkhNrcM",
      "entities": {
        "description": {
          "urls": [
            {
              "display_url": "bit.ly/ai-nt",
              "expanded_url": "http://bit.ly/ai-nt",
              "indices": [
                109,
                132
              ],
              "url": "https://t.co/5amOkqKDf2"
            },
            {
              "display_url": "bit.ly/ai-so",
              "expanded_url": "http://bit.ly/ai-so",
              "indices": [
                139,
                162
              ],
              "url": "https://t.co/DabpkhNrcM"
            }
          ],
          "user_mentions": [
            {
              "id_str": "0",
              "indices": [
                17,
                31
              ],
              "name": "",
              "screen_name": "PrincetonCITP"
            }
          ]
        },
        "url": {
          "urls": [
            {
              "display_url": "cs.princeton.edu/~sayashk",
              "expanded_url": "http://cs.princeton.edu/~sayashk",
              "indices": [
                0,
                23
              ],
              "url": "https://t.co/wO9NAWB11f"
            }
          ]
        }
      }
    },
    "isAutomated": false,
    "automatedBy": null
  },
  "extendedEntities": {
    "media": [
      {
        "allow_download_status": {
          "allow_download": true
        },
        "display_url": "pic.twitter.com/HSoLisS8Bk",
        "expanded_url": "https://twitter.com/sayashk/status/1973408326324265462/photo/1",
        "ext_media_availability": {
          "status": "Available"
        },
        "features": {
          "large": {},
          "orig": {}
        },
        "id_str": "1973406250454097925",
        "indices": [
          282,
          305
        ],
        "media_key": "3_1973406250454097925",
        "media_results": {
          "id": "QXBpTWVkaWFSZXN1bHRzOgwAAQoAARti8od72iAFCgACG2L0as9bIfYAAA==",
          "result": {
            "__typename": "ApiMedia",
            "id": "QXBpTWVkaWE6DAABCgABG2Lyh3vaIAUKAAIbYvRqz1sh9gAA",
            "media_key": "3_1973406250454097925"
          }
        },
        "media_url_https": "https://pbs.twimg.com/media/G2Lyh3vaIAU7Brq.jpg",
        "original_info": {
          "focus_rects": [
            {
              "h": 639,
              "w": 1141,
              "x": 0,
              "y": 52
            },
            {
              "h": 691,
              "w": 691,
              "x": 225,
              "y": 0
            },
            {
              "h": 691,
              "w": 606,
              "x": 267,
              "y": 0
            },
            {
              "h": 691,
              "w": 346,
              "x": 397,
              "y": 0
            },
            {
              "h": 691,
              "w": 1141,
              "x": 0,
              "y": 0
            }
          ],
          "height": 691,
          "width": 1141
        },
        "sizes": {
          "large": {
            "h": 691,
            "w": 1141
          }
        },
        "type": "photo",
        "url": "https://t.co/HSoLisS8Bk"
      }
    ]
  },
  "card": null,
  "place": {},
  "entities": {
    "user_mentions": [
      {
        "id_str": "1844990000754196482",
        "indices": [
          793,
          805
        ],
        "name": "Transluce",
        "screen_name": "TransluceAI"
      }
    ]
  },
  "quoted_tweet": {
    "type": "tweet",
    "id": "1973402363563028579",
    "url": "https://x.com/1a3orn/status/1973402363563028579",
    "twitterUrl": "https://twitter.com/1a3orn/status/1973402363563028579",
    "text": "To make a model that *doesn't* instantly learn to distinguish between \"fake-ass alignment test\" and \"normal task.\"\n\n...seems like the first thing to do seems like it would be \"make all alignment evals very small variations on actual capability evals.\"\n\nDo people do this?",
    "source": "Twitter for iPhone",
    "retweetCount": 0,
    "replyCount": 2,
    "likeCount": 10,
    "quoteCount": 1,
    "viewCount": 10705,
    "createdAt": "Wed Oct 01 14:59:22 +0000 2025",
    "lang": "en",
    "bookmarkCount": 0,
    "isReply": false,
    "inReplyToId": null,
    "conversationId": "1973402363563028579",
    "displayTextRange": [
      0,
      271
    ],
    "inReplyToUserId": null,
    "inReplyToUsername": null,
    "author": {
      "type": "user",
      "userName": "1a3orn",
      "url": "https://x.com/1a3orn",
      "twitterUrl": "https://twitter.com/1a3orn",
      "id": "1244991707239919617",
      "name": "1a3orn",
      "isVerified": false,
      "isBlueVerified": false,
      "verifiedType": null,
      "profilePicture": "https://pbs.twimg.com/profile_images/1432072030585769992/nl6gkDqm_normal.jpg",
      "coverPicture": "",
      "description": "",
      "location": "",
      "followers": 2563,
      "following": 1667,
      "status": "",
      "canDm": false,
      "canMediaTag": true,
      "createdAt": "Tue Mar 31 14:17:34 +0000 2020",
      "entities": {
        "description": {
          "urls": []
        },
        "url": {}
      },
      "fastFollowersCount": 0,
      "favouritesCount": 21696,
      "hasCustomTimelines": true,
      "isTranslator": false,
      "mediaCount": 213,
      "statusesCount": 2554,
      "withheldInCountries": [],
      "affiliatesHighlightedLabel": {},
      "possiblySensitive": false,
      "pinnedTweetIds": [],
      "profile_bio": {
        "description": "https://t.co/5ycsCvXFE5",
        "entities": {
          "description": {
            "urls": [
              {
                "display_url": "1a3orn.com",
                "expanded_url": "http://1a3orn.com",
                "indices": [
                  0,
                  23
                ],
                "url": "https://t.co/5ycsCvXFE5"
              }
            ]
          }
        }
      },
      "isAutomated": false,
      "automatedBy": null
    },
    "extendedEntities": {},
    "card": null,
    "place": {},
    "entities": {},
    "quoted_tweet": null,
    "retweeted_tweet": null,
    "isLimitedReply": false,
    "article": null
  },
  "retweeted_tweet": null,
  "isLimitedReply": false,
  "article": null
}