@ylecun
RT @zhuokaiz: Latent world models learn differentiable dynamics in a learned representation space, which should make planning as simple as…
Viewing enriched Twitter post
RT @zhuokaiz: Latent world models learn differentiable dynamics in a learned representation space, which should make planning as simple as…
{
"score": 0.36,
"score_components": {
"author": 0.09,
"engagement": 0.0,
"quality": 0.06000000000000001,
"source": 0.135,
"nlp": 0.05,
"recency": 0.025
},
"scored_at": "2026-03-15T14:07:15.435656",
"import_source": "api_import",
"source_tagged_at": "2026-03-15T14:07:15.435669",
"enriched": true,
"enriched_at": "2026-03-15T14:07:15.435672"
} {
"type": "tweet",
"id": "2033181594156986794",
"url": "https://x.com/ylecun/status/2033181594156986794",
"twitterUrl": "https://twitter.com/ylecun/status/2033181594156986794",
"text": "RT @zhuokaiz: Latent world models learn differentiable dynamics in a learned representation space, which should make planning as simple as…",
"source": "Twitter for iPhone",
"retweetCount": 15,
"replyCount": 4,
"likeCount": 132,
"quoteCount": 3,
"viewCount": 10136,
"createdAt": "Sun Mar 15 14:00:41 +0000 2026",
"lang": "en",
"bookmarkCount": 133,
"isReply": false,
"inReplyToId": null,
"conversationId": "2033181594156986794",
"displayTextRange": [
0,
139
],
"inReplyToUserId": null,
"inReplyToUsername": null,
"author": {
"type": "user",
"userName": "ylecun",
"url": "https://x.com/ylecun",
"twitterUrl": "https://twitter.com/ylecun",
"id": "48008938",
"name": "Yann LeCun",
"isVerified": false,
"isBlueVerified": true,
"verifiedType": null,
"profilePicture": "https://pbs.twimg.com/profile_images/1483577865056702469/rWA-3_T7_normal.jpg",
"coverPicture": "https://pbs.twimg.com/profile_banners/48008938/1642547502",
"description": "",
"location": "New York",
"followers": 1075293,
"following": 778,
"status": "",
"canDm": false,
"canMediaTag": true,
"createdAt": "Wed Jun 17 16:05:51 +0000 2009",
"entities": {
"description": {
"urls": []
},
"url": {}
},
"fastFollowersCount": 0,
"favouritesCount": 27477,
"hasCustomTimelines": true,
"isTranslator": false,
"mediaCount": 461,
"statusesCount": 25323,
"withheldInCountries": [],
"affiliatesHighlightedLabel": {},
"possiblySensitive": false,
"pinnedTweetIds": [
"1862598063275061484"
],
"profile_bio": {
"description": "Professor at NYU & Executive Chairman at AMI Labs. \nEx-Chief AI Scientist at Meta.\nResearcher in AI, Machine Learning, Robotics, etc.\nACM Turing Award Laureate.",
"entities": {
"description": {
"hashtags": [],
"symbols": [],
"urls": [],
"user_mentions": []
},
"url": {
"urls": [
{
"display_url": "yann.lecun.com",
"expanded_url": "http://yann.lecun.com",
"indices": [
0,
23
],
"url": "https://t.co/POp7IBHfXy"
}
]
}
}
},
"isAutomated": false,
"automatedBy": null
},
"extendedEntities": {},
"card": {
"binding_values": [
{
"key": "thumbnail_image",
"value": {
"image_value": {
"alt": "arXiv logo",
"height": 144,
"url": "https://pbs.twimg.com/card_img/2031482977276878848/A3K92F3y?format=jpg&name=144x144_2",
"width": 144
}
}
},
{
"key": "description",
"value": {
"string_value": "Learning good representations is essential for latent planning with world models. While pretrained visual encoders produce strong semantic visual features, they are not tailored to planning and..."
}
},
{
"key": "domain",
"value": {
"string_value": "arxiv.org"
}
},
{
"key": "thumbnail_image_large",
"value": {
"image_value": {
"alt": "arXiv logo",
"height": 420,
"url": "https://pbs.twimg.com/card_img/2031482977276878848/A3K92F3y?format=jpg&name=420x420_2",
"width": 420
}
}
},
{
"key": "thumbnail_image_original",
"value": {
"image_value": {
"alt": "arXiv logo",
"height": 1000,
"url": "https://pbs.twimg.com/card_img/2031482977276878848/A3K92F3y?format=jpg&name=orig",
"width": 1000
}
}
},
{
"key": "site",
"value": {
"scribe_key": "publisher_id",
"user_value": {
"id_str": "808633423300624384",
"path": []
}
}
},
{
"key": "thumbnail_image_small",
"value": {
"image_value": {
"alt": "arXiv logo",
"height": 100,
"url": "https://pbs.twimg.com/card_img/2031482977276878848/A3K92F3y?format=jpg&name=100x100_2",
"width": 100
}
}
},
{
"key": "thumbnail_image_x_large",
"value": {
"image_value": {
"alt": "arXiv logo",
"height": 1000,
"url": "https://pbs.twimg.com/card_img/2031482977276878848/A3K92F3y?format=png&name=2048x2048_2_exp",
"width": 1000
}
}
},
{
"key": "thumbnail_image_alt_text",
"value": {
"string_value": "arXiv logo"
}
},
{
"key": "vanity_url",
"value": {
"scribe_key": "vanity_url",
"string_value": "arxiv.org"
}
},
{
"key": "thumbnail_image_color",
"value": {
"image_color_value": {
"palette": [
{
"percentage": 94.17,
"rgb": {
"blue": 255,
"green": 255,
"red": 255
}
},
{
"percentage": 4.33,
"rgb": {
"blue": 105,
"green": 116,
"red": 124
}
},
{
"percentage": 1.26,
"rgb": {
"blue": 46,
"green": 21,
"red": 170
}
},
{
"percentage": 0.23,
"rgb": {
"blue": 131,
"green": 116,
"red": 203
}
}
]
}
}
},
{
"key": "title",
"value": {
"string_value": "Temporal Straightening for Latent Planning"
}
},
{
"key": "card_url",
"value": {
"scribe_key": "card_url",
"string_value": "https://t.co/NLPGxqbP2x"
}
}
],
"card_platform": {
"platform": {
"audience": {
"name": "production"
},
"device": {
"name": "iPhone",
"version": "13"
}
}
},
"name": "summary",
"url": "https://t.co/NLPGxqbP2x",
"user_refs_results": [
{
"rest_id": "808633423300624384",
"result": {
"__typename": "User",
"action_counts": {
"favorites_count": 987
},
"avatar": {
"image_url": "https://pbs.twimg.com/profile_images/1365352170267299840/IzvjKckL_normal.jpg"
},
"banner": {
"image_url": "https://pbs.twimg.com/profile_banners/808633423300624384/1481635469"
},
"core": {
"created_at": "Tue Dec 13 11:23:26 +0000 2016",
"name": "arXiv.org",
"screen_name": "arxiv"
},
"dm_permissions": {
"can_dm": true
},
"exclusive_tweet_following": false,
"follow_request_sent": false,
"identity_profile_labels_highlighted_label": {},
"location": {
"location": "Ithaca, NY"
},
"media_permissions": {
"can_media_tag": true
},
"notifications_settings": {
"notifications_enabled": false
},
"pinned_items": {
"tweet_ids_str": []
},
"privacy": {
"protected": false,
"suspended": false
},
"private_super_following": false,
"profile_bio": {
"description": "News from https://t.co/enurGFxpcS, a free distribution service and an open archive for scholarly articles.\n\nFor help with arXiv, see https://t.co/LcWuhM0BOl",
"entities": {
"description": {
"hashtags": [],
"symbols": [],
"urls": [
{
"display_url": "arXiv.org",
"expanded_url": "http://arXiv.org",
"indices": [
10,
33
],
"url": "https://t.co/enurGFxpcS"
},
{
"display_url": "arxiv.org/help",
"expanded_url": "https://arxiv.org/help",
"indices": [
133,
156
],
"url": "https://t.co/LcWuhM0BOl"
}
],
"user_mentions": []
},
"url": {
"urls": [
{
"display_url": "arxiv.org",
"expanded_url": "https://arxiv.org/",
"indices": [
0,
23
],
"url": "https://t.co/DHMkdi4lF9"
}
]
}
}
},
"profile_image_shape": "Circle",
"profile_metadata": {
"profile_interstitial_type": "",
"profile_link_color": "ABB8C2"
},
"profile_translation": {
"translator_type_enum": "None"
},
"properties": {
"has_extended_profile": true
},
"relationship_counts": {
"followers": 45816,
"following": 184
},
"relationship_perspectives": {
"blocked_by": false,
"blocking": false,
"followed_by": false,
"following": false,
"live_following": false,
"muting": false
},
"rest_id": "808633423300624384",
"smart_blocked_by": false,
"smart_blocking": false,
"super_follow_eligible": false,
"super_followed_by": false,
"super_following": false,
"tweet_counts": {
"media_tweets": 117,
"tweets": 1131
},
"verification": {
"is_blue_verified": false,
"verified": false
},
"website": {
"url": "https://t.co/DHMkdi4lF9"
}
}
}
]
},
"place": {},
"entities": {
"hashtags": [],
"symbols": [],
"timestamps": [],
"urls": [],
"user_mentions": [
{
"id_str": "1777066011579002880",
"indices": [
3,
12
],
"name": "Zhuokai Zhao",
"screen_name": "zhuokaiz"
}
]
},
"quoted_tweet": null,
"retweeted_tweet": {
"type": "tweet",
"id": "2033061315707654255",
"url": "https://x.com/zhuokaiz/status/2033061315707654255",
"twitterUrl": "https://twitter.com/zhuokaiz/status/2033061315707654255",
"text": "Latent world models learn differentiable dynamics in a learned representation space, which should make planning as simple as gradient descent.\n\nBut it almost never works.\n\nWhat I mean is, at test time, you can treat the action sequence as learnable parameters, roll out the frozen world model, measure how far the predicted final state is from the goal, and backprop through the entire unrolled chain to optimize actions directly. Yet many of the systems that work (Dreamer, TD-MPC2, DINO-WM) abandon this and fall back to sampling-based search instead.\n\nThat's why I really like this new paper by @yingwww_, @ylecun, and @mengyer, which gives a clean diagnosis of why, and a principled fix.\n\nThe reason everyone abandons gradient descent on actions is that the planning objective is highly non-convex in the learned latent space. So instead most systems use CEM (cross-entropy method) or MPPI (model predictive path integral), both derivative-free.\n\nCEM samples batches of action sequences, evaluates them by rolling out the world model, keeps the top-k, and refits the sampling distribution.\n\nMPPI does something similar but weights trajectories by exponentiated negative cost instead of hard elite selection.\n\nThese work when gradients are unreliable but the compute cost is substantial — hundreds of candidate rollouts per planning step vs a single forward-backward pass.\n\nThis paper asks what exactly makes the latent planning landscape so hostile to gradients and what you can do about it.\n\nThe diagnosis. Their baseline is DINO-WM, a JEPA-style world model with a ViT predictor planning in frozen DINOv2 feature space, minimizing terminal MSE between predicted and goal embeddings. The problem is that DINOv2 latent trajectories are highly curved (when you use MSE as the planning cost you're implicitly assuming euclidean distance approximates geodesic distance along feasible transitions).\n\nFor curved trajectories this breaks badly, gradient-based planners get trapped and straight-line distances in embedding space misrepresent actual reachability.\n\nThe fix draws from the perceptual straightening hypothesis in neuroscience — the idea that biological visual systems transform complex video into internally straighter representations. So they add a curvature regularizer during world model training.\n\nGiven consecutive encoded states\n\nz_t, z_{t+1}, z_{t+2},\n\ndefine velocity vectors as\n\nv_t = z_{t+1} - z_t\n\nmeasure curvature as the cosine similarity between consecutive velocities, and minimize\n\nL_curv = 1 - cos(v_t, v_{t+1}).\n\nTotal loss is then\n\nL_pred + λ * L_curv\n\nwith stop-gradient on the target branch to prevent collapse.\n\nThe theory backs this up cleanly — they prove that reducing curvature directly bounds how well-conditioned the planning optimization is — straighter latent trajectories guarantee faster convergence of gradient descent over longer horizons.\n\nWorth noting that even without the curvature loss, training the encoder with a prediction objective alone produces some \"implicit straightening\" — the JEPA loss naturally favors representations whose temporal evolution is predictable. Explicit regularization simply pushes this much further.\n\nEmpirical results across four 2D goal-reaching environments are consistently strong. Open-loop success improves by 20-50%, and the GD with straightening matches or beats CEM at a fraction of the compute. \n\nThe most convincing evidence is the distance heatmaps: after straightening, latent Euclidean distance closely matches the shortest distance between states, even though the model was trained only on suboptimal random trajectories.\n\nWhat I find interesting beyond the specific method is that the planning algorithm didn't change. The dynamics model didn't change. A single regularization term on the embedding geometry turned gradient descent from unreliable to competitive with sampling methods. \n\nThe field has largely treated representation learning and planning as separate concerns — learn good features, then figure out how to plan in them. \n\nThis paper makes a concrete case that the representation geometry is itself the bottleneck.\n\nThis connects to a broader pattern in ML. When optimization fails, the instinct is to fix the optimizer (better search, more samples, adaptive schedules). But often the real lever is the shape of the space you're optimizing in.\n\nSame principle shows up in RL post-training where reward landscape shaping matters as much as the algorithm itself. \n\nShape the space so simple optimization works, rather than building complex optimization to handle a bad space.\n\nTheir paper:\nhttps://t.co/NLPGxqbP2x",
"source": "Twitter for iPhone",
"retweetCount": 15,
"replyCount": 4,
"likeCount": 132,
"quoteCount": 3,
"viewCount": 10136,
"createdAt": "Sun Mar 15 06:02:44 +0000 2026",
"lang": "en",
"bookmarkCount": 133,
"isReply": false,
"inReplyToId": null,
"conversationId": "2033061315707654255",
"displayTextRange": [
0,
280
],
"inReplyToUserId": null,
"inReplyToUsername": null,
"author": {
"type": "user",
"userName": "zhuokaiz",
"url": "https://x.com/zhuokaiz",
"twitterUrl": "https://twitter.com/zhuokaiz",
"id": "1777066011579002880",
"name": "Zhuokai Zhao",
"isVerified": false,
"isBlueVerified": true,
"verifiedType": null,
"profilePicture": "https://pbs.twimg.com/profile_images/1777066097822347265/Qb8wV0zd_normal.jpg",
"coverPicture": "https://pbs.twimg.com/profile_banners/1777066011579002880/1721830339",
"description": "",
"location": "New York City, NY",
"followers": 3241,
"following": 334,
"status": "",
"canDm": true,
"canMediaTag": true,
"createdAt": "Sun Apr 07 20:08:53 +0000 2024",
"entities": {
"description": {
"urls": []
},
"url": {}
},
"fastFollowersCount": 0,
"favouritesCount": 520,
"hasCustomTimelines": true,
"isTranslator": false,
"mediaCount": 34,
"statusesCount": 213,
"withheldInCountries": [],
"affiliatesHighlightedLabel": {},
"possiblySensitive": false,
"pinnedTweetIds": [
"2020741871107031074"
],
"profile_bio": {
"description": "AI Research Scientist @Meta. Building scalable intelligence. PhD @UChicagoCS.",
"entities": {
"description": {
"hashtags": [],
"symbols": [],
"urls": [],
"user_mentions": [
{
"id_str": "0",
"indices": [
22,
27
],
"name": "",
"screen_name": "Meta"
},
{
"id_str": "0",
"indices": [
65,
76
],
"name": "",
"screen_name": "UChicagoCS"
}
]
},
"url": {
"urls": [
{
"display_url": "zhuokai-zhao.com",
"expanded_url": "https://zhuokai-zhao.com/",
"indices": [
0,
23
],
"url": "https://t.co/KKddPkvrip"
}
]
}
}
},
"isAutomated": false,
"automatedBy": null
},
"extendedEntities": {},
"card": {
"binding_values": [
{
"key": "thumbnail_image",
"value": {
"image_value": {
"alt": "arXiv logo",
"height": 144,
"url": "https://pbs.twimg.com/card_img/2031482977276878848/A3K92F3y?format=jpg&name=144x144_2",
"width": 144
}
}
},
{
"key": "description",
"value": {
"string_value": "Learning good representations is essential for latent planning with world models. While pretrained visual encoders produce strong semantic visual features, they are not tailored to planning and..."
}
},
{
"key": "domain",
"value": {
"string_value": "arxiv.org"
}
},
{
"key": "thumbnail_image_large",
"value": {
"image_value": {
"alt": "arXiv logo",
"height": 420,
"url": "https://pbs.twimg.com/card_img/2031482977276878848/A3K92F3y?format=jpg&name=420x420_2",
"width": 420
}
}
},
{
"key": "thumbnail_image_original",
"value": {
"image_value": {
"alt": "arXiv logo",
"height": 1000,
"url": "https://pbs.twimg.com/card_img/2031482977276878848/A3K92F3y?format=jpg&name=orig",
"width": 1000
}
}
},
{
"key": "site",
"value": {
"scribe_key": "publisher_id",
"user_value": {
"id_str": "808633423300624384",
"path": []
}
}
},
{
"key": "thumbnail_image_small",
"value": {
"image_value": {
"alt": "arXiv logo",
"height": 100,
"url": "https://pbs.twimg.com/card_img/2031482977276878848/A3K92F3y?format=jpg&name=100x100_2",
"width": 100
}
}
},
{
"key": "thumbnail_image_x_large",
"value": {
"image_value": {
"alt": "arXiv logo",
"height": 1000,
"url": "https://pbs.twimg.com/card_img/2031482977276878848/A3K92F3y?format=png&name=2048x2048_2_exp",
"width": 1000
}
}
},
{
"key": "thumbnail_image_alt_text",
"value": {
"string_value": "arXiv logo"
}
},
{
"key": "vanity_url",
"value": {
"scribe_key": "vanity_url",
"string_value": "arxiv.org"
}
},
{
"key": "thumbnail_image_color",
"value": {
"image_color_value": {
"palette": [
{
"percentage": 94.17,
"rgb": {
"blue": 255,
"green": 255,
"red": 255
}
},
{
"percentage": 4.33,
"rgb": {
"blue": 105,
"green": 116,
"red": 124
}
},
{
"percentage": 1.26,
"rgb": {
"blue": 46,
"green": 21,
"red": 170
}
},
{
"percentage": 0.23,
"rgb": {
"blue": 131,
"green": 116,
"red": 203
}
}
]
}
}
},
{
"key": "title",
"value": {
"string_value": "Temporal Straightening for Latent Planning"
}
},
{
"key": "card_url",
"value": {
"scribe_key": "card_url",
"string_value": "https://t.co/NLPGxqbP2x"
}
}
],
"card_platform": {
"platform": {
"audience": {
"name": "production"
},
"device": {
"name": "iPhone",
"version": "13"
}
}
},
"name": "summary",
"url": "https://t.co/NLPGxqbP2x",
"user_refs_results": [
{
"rest_id": "808633423300624384",
"result": {
"__typename": "User",
"action_counts": {
"favorites_count": 987
},
"avatar": {
"image_url": "https://pbs.twimg.com/profile_images/1365352170267299840/IzvjKckL_normal.jpg"
},
"banner": {
"image_url": "https://pbs.twimg.com/profile_banners/808633423300624384/1481635469"
},
"core": {
"created_at": "Tue Dec 13 11:23:26 +0000 2016",
"name": "arXiv.org",
"screen_name": "arxiv"
},
"dm_permissions": {
"can_dm": true
},
"exclusive_tweet_following": false,
"follow_request_sent": false,
"identity_profile_labels_highlighted_label": {},
"location": {
"location": "Ithaca, NY"
},
"media_permissions": {
"can_media_tag": true
},
"notifications_settings": {
"notifications_enabled": false
},
"pinned_items": {
"tweet_ids_str": []
},
"privacy": {
"protected": false,
"suspended": false
},
"private_super_following": false,
"profile_bio": {
"description": "News from https://t.co/enurGFxpcS, a free distribution service and an open archive for scholarly articles.\n\nFor help with arXiv, see https://t.co/LcWuhM0BOl",
"entities": {
"description": {
"hashtags": [],
"symbols": [],
"urls": [
{
"display_url": "arXiv.org",
"expanded_url": "http://arXiv.org",
"indices": [
10,
33
],
"url": "https://t.co/enurGFxpcS"
},
{
"display_url": "arxiv.org/help",
"expanded_url": "https://arxiv.org/help",
"indices": [
133,
156
],
"url": "https://t.co/LcWuhM0BOl"
}
],
"user_mentions": []
},
"url": {
"urls": [
{
"display_url": "arxiv.org",
"expanded_url": "https://arxiv.org/",
"indices": [
0,
23
],
"url": "https://t.co/DHMkdi4lF9"
}
]
}
}
},
"profile_image_shape": "Circle",
"profile_metadata": {
"profile_interstitial_type": "",
"profile_link_color": "ABB8C2"
},
"profile_translation": {
"translator_type_enum": "None"
},
"properties": {
"has_extended_profile": true
},
"relationship_counts": {
"followers": 45816,
"following": 184
},
"relationship_perspectives": {
"blocked_by": false,
"blocking": false,
"followed_by": false,
"following": false,
"live_following": false,
"muting": false
},
"rest_id": "808633423300624384",
"smart_blocked_by": false,
"smart_blocking": false,
"super_follow_eligible": false,
"super_followed_by": false,
"super_following": false,
"tweet_counts": {
"media_tweets": 117,
"tweets": 1131
},
"verification": {
"is_blue_verified": false,
"verified": false
},
"website": {
"url": "https://t.co/DHMkdi4lF9"
}
}
}
]
},
"place": {},
"entities": {
"hashtags": [],
"symbols": [],
"urls": [
{
"display_url": "arxiv.org/abs/2603.12231",
"expanded_url": "https://arxiv.org/abs/2603.12231",
"indices": [
4596,
4619
],
"url": "https://t.co/NLPGxqbP2x"
}
],
"user_mentions": [
{
"id_str": "1706062297934733312",
"indices": [
598,
607
],
"name": "Ying Wang",
"screen_name": "yingwww_"
},
{
"id_str": "48008938",
"indices": [
609,
616
],
"name": "Yann LeCun",
"screen_name": "ylecun"
},
{
"id_str": "56113666",
"indices": [
622,
630
],
"name": "Mengye Ren",
"screen_name": "mengyer"
}
]
},
"quoted_tweet": null,
"retweeted_tweet": null,
"isLimitedReply": false,
"article": null
},
"isLimitedReply": false,
"article": null
}