@janleike
Moreover, you can train out the misaligned behavior using regular RLHF using ~in-distribution prompts https://t.co/sQRL4o32Y7
Viewing enriched Twitter post
Moreover, you can train out the misaligned behavior using regular RLHF using ~in-distribution prompts https://t.co/sQRL4o32Y7
{
"media": [
{
"type": "photo",
"url": "https://crmoxkoizveukayfjuyo.supabase.co/storage/v1/object/public/media/posts/1991955834730160200/media_0.jpg?",
"filename": "media_0.jpg"
}
],
"processed_at": "2025-11-27T20:17:15.424603",
"pipeline_version": "2.0"
} {
"type": "tweet",
"id": "1991955834730160200",
"url": "https://x.com/janleike/status/1991955834730160200",
"twitterUrl": "https://twitter.com/janleike/status/1991955834730160200",
"text": "Moreover, you can train out the misaligned behavior using regular RLHF using ~in-distribution prompts https://t.co/sQRL4o32Y7",
"source": "Twitter for iPhone",
"retweetCount": 1,
"replyCount": 1,
"likeCount": 24,
"quoteCount": 1,
"viewCount": 1345,
"createdAt": "Fri Nov 21 19:44:14 +0000 2025",
"lang": "en",
"bookmarkCount": 2,
"isReply": true,
"inReplyToId": "1991955832637161644",
"conversationId": "1991955830040863011",
"displayTextRange": [
0,
101
],
"inReplyToUserId": "710610891058716673",
"inReplyToUsername": "janleike",
"author": {
"type": "user",
"userName": "janleike",
"url": "https://x.com/janleike",
"twitterUrl": "https://twitter.com/janleike",
"id": "710610891058716673",
"name": "Jan Leike",
"isVerified": false,
"isBlueVerified": true,
"verifiedType": null,
"profilePicture": "https://pbs.twimg.com/profile_images/1077523091700502528/2YCa_F4o_normal.jpg",
"coverPicture": "",
"description": "",
"location": "San Francisco, USA",
"followers": 117612,
"following": 332,
"status": "",
"canDm": true,
"canMediaTag": false,
"createdAt": "Thu Mar 17 23:36:53 +0000 2016",
"entities": {
"description": {
"urls": []
},
"url": {}
},
"fastFollowersCount": 0,
"favouritesCount": 3645,
"hasCustomTimelines": true,
"isTranslator": false,
"mediaCount": 40,
"statusesCount": 756,
"withheldInCountries": [],
"affiliatesHighlightedLabel": {},
"possiblySensitive": false,
"pinnedTweetIds": [
"1795497960509448617"
],
"profile_bio": {
"description": "Alignment team lead @AnthropicAI. Previously OpenAI & DeepMind.\nOptimizing for a post-AGI future where humanity flourishes.\nOpinions aren't my employer's.",
"entities": {
"description": {
"user_mentions": [
{
"id_str": "0",
"indices": [
20,
32
],
"name": "",
"screen_name": "AnthropicAI"
}
]
},
"url": {
"urls": [
{
"display_url": "jan.leike.name",
"expanded_url": "https://jan.leike.name/",
"indices": [
0,
23
],
"url": "https://t.co/Uvp4pU8R0f"
}
]
}
}
},
"isAutomated": false,
"automatedBy": null
},
"extendedEntities": {
"media": [
{
"display_url": "pic.twitter.com/sQRL4o32Y7",
"expanded_url": "https://twitter.com/janleike/status/1991955834730160200/photo/1",
"ext_media_availability": {
"status": "Available"
},
"features": {
"large": {},
"orig": {}
},
"id_str": "1991954701831815168",
"indices": [
102,
125
],
"media_key": "3_1991954701831815168",
"media_results": {
"id": "QXBpTWVkaWFSZXN1bHRzOgwAAQoAARuk2D9bWsAACgACG6TZRyFboEgAAA==",
"result": {
"__typename": "ApiMedia",
"id": "QXBpTWVkaWE6DAABCgABG6TYP1tawAAKAAIbpNlHIVugSAAA",
"media_key": "3_1991954701831815168"
}
},
"media_url_https": "https://pbs.twimg.com/media/G6TYP1tawAAq9mA.jpg",
"original_info": {
"focus_rects": [
{
"h": 932,
"w": 1664,
"x": 0,
"y": 0
},
{
"h": 932,
"w": 932,
"x": 0,
"y": 0
},
{
"h": 932,
"w": 818,
"x": 0,
"y": 0
},
{
"h": 932,
"w": 466,
"x": 49,
"y": 0
},
{
"h": 932,
"w": 2262,
"x": 0,
"y": 0
}
],
"height": 932,
"width": 2262
},
"sizes": {
"large": {
"h": 844,
"w": 2048
}
},
"type": "photo",
"url": "https://t.co/sQRL4o32Y7"
}
]
},
"card": null,
"place": {},
"entities": {},
"quoted_tweet": null,
"retweeted_tweet": null,
"isLimitedReply": false,
"article": null
}