@AnthropicAI
Remarkably, prompts that gave the model permission to reward hack stopped the broader misalignment. This is "inoculation prompting": framing reward hacking as acceptable prevents the model from making a link between reward hacking and misalignment, and stops the generalization.