Revert kvec msg iterator before trying to process a TLS alert
when possible.
In nvmet_tcp_try_recv_data(), it's assumed that no msg control
message buffer is set prior to sock_recvmsg(). If no control
message structure is setup, kTLS layer will read and process
TLS data record types. As soon as it encounters a TLS control
message, it would return an error. At that point, we setup a kvec
backed control buffer and read in the control message such as
a TLS alert. Msg can advance the kvec pointer as a part of the
copy process thus we need to revert the iterator before calling
into the tls_alert_recv.
Fixes: a1c5dd8355b1 ("nvmet-tcp: control messages for recvmsg()")
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Olga Kornievskaia <okorniev(a)redhat.com>
---
drivers/nvme/target/tcp.c | 48 +++++++++++++++++++++++++++++++--------
1 file changed, 38 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 688033b88d38..cf3336ddc9a3 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1161,6 +1161,7 @@ static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
if (unlikely(len < 0))
return len;
if (queue->tls_pskid) {
+ iov_iter_revert(&msg.msg_iter, len);
ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf);
if (ret < 0)
return ret;
@@ -1218,18 +1219,47 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
{
struct nvmet_tcp_cmd *cmd = queue->cmd;
int len, ret;
+ union {
+ struct cmsghdr cmsg;
+ u8 buf[CMSG_SPACE(sizeof(u8))];
+ } u;
+ u8 alert[2];
+ struct kvec alert_kvec = {
+ .iov_base = alert,
+ .iov_len = sizeof(alert),
+ };
+ struct msghdr msg = {
+ .msg_control = &u,
+ .msg_controllen = sizeof(u),
+ };
while (msg_data_left(&cmd->recv_msg)) {
+ /* assumed that cmg->recv_msg's control buffer is not setup
+ */
+ WARN_ON_ONCE(cmd->recv_msg.msg_controllen > 0);
+
len = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
cmd->recv_msg.msg_flags);
+ if (cmd->recv_msg.msg_flags & MSG_CTRUNC) {
+ cmd->recv_msg.msg_flags &= ~(MSG_CTRUNC | MSG_EOF);
+ if (len == 0 || len == -EIO) {
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec,
+ 1, alert_kvec.iov_len);
+ len = sock_recvmsg(cmd->queue->sock, &msg,
+ MSG_DONTWAIT);
+ if (len > 0 &&
+ tls_get_record_type(cmd->queue->sock->sk,
+ &u.cmsg) == TLS_RECORD_TYPE_ALERT) {
+ iov_iter_revert(&msg.msg_iter, len);
+ ret = nvmet_tcp_tls_record_ok(cmd->queue,
+ &msg, u.buf);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ }
if (len <= 0)
return len;
- if (queue->tls_pskid) {
- ret = nvmet_tcp_tls_record_ok(cmd->queue,
- &cmd->recv_msg, cmd->recv_cbuf);
- if (ret < 0)
- return ret;
- }
cmd->pdu_recv += len;
cmd->rbytes_done += len;
@@ -1267,6 +1297,7 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
if (unlikely(len < 0))
return len;
if (queue->tls_pskid) {
+ iov_iter_revert(&msg.msg_iter, len);
ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf);
if (ret < 0)
return ret;
@@ -1453,10 +1484,6 @@ static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
if (!c->r2t_pdu)
goto out_free_data;
- if (queue->state == NVMET_TCP_Q_TLS_HANDSHAKE) {
- c->recv_msg.msg_control = c->recv_cbuf;
- c->recv_msg.msg_controllen = sizeof(c->recv_cbuf);
- }
c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
list_add_tail(&c->entry, &queue->free_list);
@@ -1736,6 +1763,7 @@ static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue)
return len;
}
+ iov_iter_revert(&msg.msg_iter, len);
ret = nvmet_tcp_tls_record_ok(queue, &msg, cbuf);
if (ret < 0)
return ret;
--
2.47.1
A security exploit was discovered in NFS over TLS in tls_alert_recv
due to its assumption that there is valid data in the msghdr's
iterator's kvec.
Instead, this patch proposes the rework how control messages are
setup and used by sock_recvmsg().
If no control message structure is setup, kTLS layer will read and
process TLS data record types. As soon as it encounters a TLS control
message, it would return an error. At that point, NFS can setup a kvec
backed control buffer and read in the control message such as a TLS
alert. Msg iterator can advance the kvec pointer as a part of the copy
process thus we need to revert the iterator before calling into the
tls_alert_recv.
Fixes: dea034b963c8 ("SUNRPC: Capture CMSG metadata on client-side receive")
Suggested-by: Trond Myklebust <trondmy(a)hammerspace.com>
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Olga Kornievskaia <okorniev(a)redhat.com>
---
net/sunrpc/xprtsock.c | 40 ++++++++++++++++++++++++++++++----------
1 file changed, 30 insertions(+), 10 deletions(-)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 04ff66758fc3..c5f7bbf5775f 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -358,7 +358,7 @@ xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp)
static int
xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
- struct cmsghdr *cmsg, int ret)
+ unsigned int *msg_flags, struct cmsghdr *cmsg, int ret)
{
u8 content_type = tls_get_record_type(sock->sk, cmsg);
u8 level, description;
@@ -371,7 +371,7 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
* record, even though there might be more frames
* waiting to be decrypted.
*/
- msg->msg_flags &= ~MSG_EOR;
+ *msg_flags &= ~MSG_EOR;
break;
case TLS_RECORD_TYPE_ALERT:
tls_alert_recv(sock->sk, msg, &level, &description);
@@ -386,19 +386,33 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
}
static int
-xs_sock_recv_cmsg(struct socket *sock, struct msghdr *msg, int flags)
+xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags)
{
union {
struct cmsghdr cmsg;
u8 buf[CMSG_SPACE(sizeof(u8))];
} u;
+ u8 alert[2];
+ struct kvec alert_kvec = {
+ .iov_base = alert,
+ .iov_len = sizeof(alert),
+ };
+ struct msghdr msg = {
+ .msg_flags = *msg_flags,
+ .msg_control = &u,
+ .msg_controllen = sizeof(u),
+ };
int ret;
- msg->msg_control = &u;
- msg->msg_controllen = sizeof(u);
- ret = sock_recvmsg(sock, msg, flags);
- if (msg->msg_controllen != sizeof(u))
- ret = xs_sock_process_cmsg(sock, msg, &u.cmsg, ret);
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1,
+ alert_kvec.iov_len);
+ ret = sock_recvmsg(sock, &msg, flags);
+ if (ret > 0 &&
+ tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) {
+ iov_iter_revert(&msg.msg_iter, ret);
+ ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg,
+ -EAGAIN);
+ }
return ret;
}
@@ -408,7 +422,13 @@ xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek)
ssize_t ret;
if (seek != 0)
iov_iter_advance(&msg->msg_iter, seek);
- ret = xs_sock_recv_cmsg(sock, msg, flags);
+ ret = sock_recvmsg(sock, msg, flags);
+ /* Handle TLS inband control message lazily */
+ if (msg->msg_flags & MSG_CTRUNC) {
+ msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR);
+ if (ret == 0 || ret == -EIO)
+ ret = xs_sock_recv_cmsg(sock, &msg->msg_flags, flags);
+ }
return ret > 0 ? ret + seek : ret;
}
@@ -434,7 +454,7 @@ xs_read_discard(struct socket *sock, struct msghdr *msg, int flags,
size_t count)
{
iov_iter_discard(&msg->msg_iter, ITER_DEST, count);
- return xs_sock_recv_cmsg(sock, msg, flags);
+ return xs_sock_recvmsg(sock, msg, flags, 0);
}
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
--
2.47.1
Scott Mayhew discovered a security exploit in NFS over TLS in
tls_alert_recv() due to its assumption it can read data from
the msg iterator's kvec..
kTLS implementation splits TLS non-data record payload between
the control message buffer (which includes the type such as TLS
aler or TLS cipher change) and the rest of the payload (say TLS
alert's level/description) which goes into the msg payload buffer.
This patch proposes to rework how control messages are setup and
used by sock_recvmsg().
If no control message structure is setup, kTLS layer will read and
process TLS data record types. As soon as it encounters a TLS control
message, it would return an error. At that point, NFS can setup a
kvec backed msg buffer and read in the control message such as a
TLS alert. Msg iterator can advance the kvec pointer as a part of
the copy process thus we need to revert the iterator before calling
into the tls_alert_recv.
Reported-by: Scott Mayhew <smayhew(a)redhat.com>
Fixes: 5e052dda121e ("SUNRPC: Recognize control messages in server-side TCP socket code")
Suggested-by: Trond Myklebust <trondmy(a)hammerspace.com>
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Olga Kornievskaia <okorniev(a)redhat.com>
---
net/sunrpc/svcsock.c | 43 +++++++++++++++++++++++++++++++++++--------
1 file changed, 35 insertions(+), 8 deletions(-)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 46c156b121db..e2c5e0e626f9 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -257,20 +257,47 @@ svc_tcp_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
}
static int
-svc_tcp_sock_recv_cmsg(struct svc_sock *svsk, struct msghdr *msg)
+svc_tcp_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags)
{
union {
struct cmsghdr cmsg;
u8 buf[CMSG_SPACE(sizeof(u8))];
} u;
- struct socket *sock = svsk->sk_sock;
+ u8 alert[2];
+ struct kvec alert_kvec = {
+ .iov_base = alert,
+ .iov_len = sizeof(alert),
+ };
+ struct msghdr msg = {
+ .msg_flags = *msg_flags,
+ .msg_control = &u,
+ .msg_controllen = sizeof(u),
+ };
+ int ret;
+
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1,
+ alert_kvec.iov_len);
+ ret = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
+ if (ret > 0 &&
+ tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) {
+ iov_iter_revert(&msg.msg_iter, ret);
+ ret = svc_tcp_sock_process_cmsg(sock, &msg, &u.cmsg, -EAGAIN);
+ }
+ return ret;
+}
+
+static int
+svc_tcp_sock_recvmsg(struct svc_sock *svsk, struct msghdr *msg)
+{
int ret;
+ struct socket *sock = svsk->sk_sock;
- msg->msg_control = &u;
- msg->msg_controllen = sizeof(u);
ret = sock_recvmsg(sock, msg, MSG_DONTWAIT);
- if (unlikely(msg->msg_controllen != sizeof(u)))
- ret = svc_tcp_sock_process_cmsg(sock, msg, &u.cmsg, ret);
+ if (msg->msg_flags & MSG_CTRUNC) {
+ msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR);
+ if (ret == 0 || ret == -EIO)
+ ret = svc_tcp_sock_recv_cmsg(sock, &msg->msg_flags);
+ }
return ret;
}
@@ -321,7 +348,7 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen,
iov_iter_advance(&msg.msg_iter, seek);
buflen -= seek;
}
- len = svc_tcp_sock_recv_cmsg(svsk, &msg);
+ len = svc_tcp_sock_recvmsg(svsk, &msg);
if (len > 0)
svc_flush_bvec(bvec, len, seek);
@@ -1018,7 +1045,7 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk,
iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen;
iov.iov_len = want;
iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, want);
- len = svc_tcp_sock_recv_cmsg(svsk, &msg);
+ len = svc_tcp_sock_recvmsg(svsk, &msg);
if (len < 0)
return len;
svsk->sk_tcplen += len;
--
2.47.1
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 15f77764e90a713ee3916ca424757688e4f565b9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025072854-humbling-collector-aca6@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 15f77764e90a713ee3916ca424757688e4f565b9 Mon Sep 17 00:00:00 2001
From: "Lin.Cao" <lincao12(a)amd.com>
Date: Thu, 17 Jul 2025 16:44:53 +0800
Subject: [PATCH] drm/sched: Remove optimization that causes hang when killing
dependent jobs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When application A submits jobs and application B submits a job with a
dependency on A's fence, the normal flow wakes up the scheduler after
processing each job. However, the optimization in
drm_sched_entity_add_dependency_cb() uses a callback that only clears
dependencies without waking up the scheduler.
When application A is killed before its jobs can run, the callback gets
triggered but only clears the dependency without waking up the scheduler,
causing the scheduler to enter sleep state and application B to hang.
Remove the optimization by deleting drm_sched_entity_clear_dep() and its
usage, ensuring the scheduler is always woken up when dependencies are
cleared.
Fixes: 777dbd458c89 ("drm/amdgpu: drop a dummy wakeup scheduler")
Cc: stable(a)vger.kernel.org # v4.6+
Signed-off-by: Lin.Cao <lincao12(a)amd.com>
Reviewed-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Philipp Stanner <phasta(a)kernel.org>
Link: https://lore.kernel.org/r/20250717084453.921097-1-lincao12@amd.com
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index e671aa241720..ac678de7fe5e 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -355,17 +355,6 @@ void drm_sched_entity_destroy(struct drm_sched_entity *entity)
}
EXPORT_SYMBOL(drm_sched_entity_destroy);
-/* drm_sched_entity_clear_dep - callback to clear the entities dependency */
-static void drm_sched_entity_clear_dep(struct dma_fence *f,
- struct dma_fence_cb *cb)
-{
- struct drm_sched_entity *entity =
- container_of(cb, struct drm_sched_entity, cb);
-
- entity->dependency = NULL;
- dma_fence_put(f);
-}
-
/*
* drm_sched_entity_wakeup - callback to clear the entity's dependency and
* wake up the scheduler
@@ -376,7 +365,8 @@ static void drm_sched_entity_wakeup(struct dma_fence *f,
struct drm_sched_entity *entity =
container_of(cb, struct drm_sched_entity, cb);
- drm_sched_entity_clear_dep(f, cb);
+ entity->dependency = NULL;
+ dma_fence_put(f);
drm_sched_wakeup(entity->rq->sched);
}
@@ -429,13 +419,6 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
fence = dma_fence_get(&s_fence->scheduled);
dma_fence_put(entity->dependency);
entity->dependency = fence;
- if (!dma_fence_add_callback(fence, &entity->cb,
- drm_sched_entity_clear_dep))
- return true;
-
- /* Ignore it when it is already scheduled */
- dma_fence_put(fence);
- return false;
}
if (!dma_fence_add_callback(entity->dependency, &entity->cb,
A security exploit was discovered in NFS over TLS in tls_alert_recv
due to its assumption that there is valid data in the msghdr's
iterator's kvec.
Instead, this patch proposes the rework how control messages are
setup and used by sock_recvmsg().
If no control message structure is setup, kTLS layer will read and
process TLS data record types. As soon as it encounters a TLS control
message, it would return an error. At that point, NFS can setup a kvec
backed control buffer and read in the control message such as a TLS
alert. Msg iterator can advance the kvec pointer as a part of the copy
process thus we need to revert the iterator before calling into the
tls_alert_recv.
Fixes: dea034b963c8 ("SUNRPC: Capture CMSG metadata on client-side receive")
Suggested-by: Trond Myklebust <trondmy(a)hammerspace.com>
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Olga Kornievskaia <okorniev(a)redhat.com>
---
net/sunrpc/xprtsock.c | 40 ++++++++++++++++++++++++++++++----------
1 file changed, 30 insertions(+), 10 deletions(-)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 04ff66758fc3..c5f7bbf5775f 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -358,7 +358,7 @@ xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp)
static int
xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
- struct cmsghdr *cmsg, int ret)
+ unsigned int *msg_flags, struct cmsghdr *cmsg, int ret)
{
u8 content_type = tls_get_record_type(sock->sk, cmsg);
u8 level, description;
@@ -371,7 +371,7 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
* record, even though there might be more frames
* waiting to be decrypted.
*/
- msg->msg_flags &= ~MSG_EOR;
+ *msg_flags &= ~MSG_EOR;
break;
case TLS_RECORD_TYPE_ALERT:
tls_alert_recv(sock->sk, msg, &level, &description);
@@ -386,19 +386,33 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg,
}
static int
-xs_sock_recv_cmsg(struct socket *sock, struct msghdr *msg, int flags)
+xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags)
{
union {
struct cmsghdr cmsg;
u8 buf[CMSG_SPACE(sizeof(u8))];
} u;
+ u8 alert[2];
+ struct kvec alert_kvec = {
+ .iov_base = alert,
+ .iov_len = sizeof(alert),
+ };
+ struct msghdr msg = {
+ .msg_flags = *msg_flags,
+ .msg_control = &u,
+ .msg_controllen = sizeof(u),
+ };
int ret;
- msg->msg_control = &u;
- msg->msg_controllen = sizeof(u);
- ret = sock_recvmsg(sock, msg, flags);
- if (msg->msg_controllen != sizeof(u))
- ret = xs_sock_process_cmsg(sock, msg, &u.cmsg, ret);
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1,
+ alert_kvec.iov_len);
+ ret = sock_recvmsg(sock, &msg, flags);
+ if (ret > 0 &&
+ tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) {
+ iov_iter_revert(&msg.msg_iter, ret);
+ ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg,
+ -EAGAIN);
+ }
return ret;
}
@@ -408,7 +422,13 @@ xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek)
ssize_t ret;
if (seek != 0)
iov_iter_advance(&msg->msg_iter, seek);
- ret = xs_sock_recv_cmsg(sock, msg, flags);
+ ret = sock_recvmsg(sock, msg, flags);
+ /* Handle TLS inband control message lazily */
+ if (msg->msg_flags & MSG_CTRUNC) {
+ msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR);
+ if (ret == 0 || ret == -EIO)
+ ret = xs_sock_recv_cmsg(sock, &msg->msg_flags, flags);
+ }
return ret > 0 ? ret + seek : ret;
}
@@ -434,7 +454,7 @@ xs_read_discard(struct socket *sock, struct msghdr *msg, int flags,
size_t count)
{
iov_iter_discard(&msg->msg_iter, ITER_DEST, count);
- return xs_sock_recv_cmsg(sock, msg, flags);
+ return xs_sock_recvmsg(sock, msg, flags, 0);
}
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
--
2.47.1
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 15f77764e90a713ee3916ca424757688e4f565b9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025072854-backlog-freeness-4f50@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 15f77764e90a713ee3916ca424757688e4f565b9 Mon Sep 17 00:00:00 2001
From: "Lin.Cao" <lincao12(a)amd.com>
Date: Thu, 17 Jul 2025 16:44:53 +0800
Subject: [PATCH] drm/sched: Remove optimization that causes hang when killing
dependent jobs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When application A submits jobs and application B submits a job with a
dependency on A's fence, the normal flow wakes up the scheduler after
processing each job. However, the optimization in
drm_sched_entity_add_dependency_cb() uses a callback that only clears
dependencies without waking up the scheduler.
When application A is killed before its jobs can run, the callback gets
triggered but only clears the dependency without waking up the scheduler,
causing the scheduler to enter sleep state and application B to hang.
Remove the optimization by deleting drm_sched_entity_clear_dep() and its
usage, ensuring the scheduler is always woken up when dependencies are
cleared.
Fixes: 777dbd458c89 ("drm/amdgpu: drop a dummy wakeup scheduler")
Cc: stable(a)vger.kernel.org # v4.6+
Signed-off-by: Lin.Cao <lincao12(a)amd.com>
Reviewed-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Philipp Stanner <phasta(a)kernel.org>
Link: https://lore.kernel.org/r/20250717084453.921097-1-lincao12@amd.com
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index e671aa241720..ac678de7fe5e 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -355,17 +355,6 @@ void drm_sched_entity_destroy(struct drm_sched_entity *entity)
}
EXPORT_SYMBOL(drm_sched_entity_destroy);
-/* drm_sched_entity_clear_dep - callback to clear the entities dependency */
-static void drm_sched_entity_clear_dep(struct dma_fence *f,
- struct dma_fence_cb *cb)
-{
- struct drm_sched_entity *entity =
- container_of(cb, struct drm_sched_entity, cb);
-
- entity->dependency = NULL;
- dma_fence_put(f);
-}
-
/*
* drm_sched_entity_wakeup - callback to clear the entity's dependency and
* wake up the scheduler
@@ -376,7 +365,8 @@ static void drm_sched_entity_wakeup(struct dma_fence *f,
struct drm_sched_entity *entity =
container_of(cb, struct drm_sched_entity, cb);
- drm_sched_entity_clear_dep(f, cb);
+ entity->dependency = NULL;
+ dma_fence_put(f);
drm_sched_wakeup(entity->rq->sched);
}
@@ -429,13 +419,6 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
fence = dma_fence_get(&s_fence->scheduled);
dma_fence_put(entity->dependency);
entity->dependency = fence;
- if (!dma_fence_add_callback(fence, &entity->cb,
- drm_sched_entity_clear_dep))
- return true;
-
- /* Ignore it when it is already scheduled */
- dma_fence_put(fence);
- return false;
}
if (!dma_fence_add_callback(entity->dependency, &entity->cb,
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 15f77764e90a713ee3916ca424757688e4f565b9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025072853-startup-unwomanly-7e1c@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 15f77764e90a713ee3916ca424757688e4f565b9 Mon Sep 17 00:00:00 2001
From: "Lin.Cao" <lincao12(a)amd.com>
Date: Thu, 17 Jul 2025 16:44:53 +0800
Subject: [PATCH] drm/sched: Remove optimization that causes hang when killing
dependent jobs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When application A submits jobs and application B submits a job with a
dependency on A's fence, the normal flow wakes up the scheduler after
processing each job. However, the optimization in
drm_sched_entity_add_dependency_cb() uses a callback that only clears
dependencies without waking up the scheduler.
When application A is killed before its jobs can run, the callback gets
triggered but only clears the dependency without waking up the scheduler,
causing the scheduler to enter sleep state and application B to hang.
Remove the optimization by deleting drm_sched_entity_clear_dep() and its
usage, ensuring the scheduler is always woken up when dependencies are
cleared.
Fixes: 777dbd458c89 ("drm/amdgpu: drop a dummy wakeup scheduler")
Cc: stable(a)vger.kernel.org # v4.6+
Signed-off-by: Lin.Cao <lincao12(a)amd.com>
Reviewed-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Philipp Stanner <phasta(a)kernel.org>
Link: https://lore.kernel.org/r/20250717084453.921097-1-lincao12@amd.com
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index e671aa241720..ac678de7fe5e 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -355,17 +355,6 @@ void drm_sched_entity_destroy(struct drm_sched_entity *entity)
}
EXPORT_SYMBOL(drm_sched_entity_destroy);
-/* drm_sched_entity_clear_dep - callback to clear the entities dependency */
-static void drm_sched_entity_clear_dep(struct dma_fence *f,
- struct dma_fence_cb *cb)
-{
- struct drm_sched_entity *entity =
- container_of(cb, struct drm_sched_entity, cb);
-
- entity->dependency = NULL;
- dma_fence_put(f);
-}
-
/*
* drm_sched_entity_wakeup - callback to clear the entity's dependency and
* wake up the scheduler
@@ -376,7 +365,8 @@ static void drm_sched_entity_wakeup(struct dma_fence *f,
struct drm_sched_entity *entity =
container_of(cb, struct drm_sched_entity, cb);
- drm_sched_entity_clear_dep(f, cb);
+ entity->dependency = NULL;
+ dma_fence_put(f);
drm_sched_wakeup(entity->rq->sched);
}
@@ -429,13 +419,6 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
fence = dma_fence_get(&s_fence->scheduled);
dma_fence_put(entity->dependency);
entity->dependency = fence;
- if (!dma_fence_add_callback(fence, &entity->cb,
- drm_sched_entity_clear_dep))
- return true;
-
- /* Ignore it when it is already scheduled */
- dma_fence_put(fence);
- return false;
}
if (!dma_fence_add_callback(entity->dependency, &entity->cb,
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 15f77764e90a713ee3916ca424757688e4f565b9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025072852-shrivel-theorize-77f8@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 15f77764e90a713ee3916ca424757688e4f565b9 Mon Sep 17 00:00:00 2001
From: "Lin.Cao" <lincao12(a)amd.com>
Date: Thu, 17 Jul 2025 16:44:53 +0800
Subject: [PATCH] drm/sched: Remove optimization that causes hang when killing
dependent jobs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When application A submits jobs and application B submits a job with a
dependency on A's fence, the normal flow wakes up the scheduler after
processing each job. However, the optimization in
drm_sched_entity_add_dependency_cb() uses a callback that only clears
dependencies without waking up the scheduler.
When application A is killed before its jobs can run, the callback gets
triggered but only clears the dependency without waking up the scheduler,
causing the scheduler to enter sleep state and application B to hang.
Remove the optimization by deleting drm_sched_entity_clear_dep() and its
usage, ensuring the scheduler is always woken up when dependencies are
cleared.
Fixes: 777dbd458c89 ("drm/amdgpu: drop a dummy wakeup scheduler")
Cc: stable(a)vger.kernel.org # v4.6+
Signed-off-by: Lin.Cao <lincao12(a)amd.com>
Reviewed-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Philipp Stanner <phasta(a)kernel.org>
Link: https://lore.kernel.org/r/20250717084453.921097-1-lincao12@amd.com
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
index e671aa241720..ac678de7fe5e 100644
--- a/drivers/gpu/drm/scheduler/sched_entity.c
+++ b/drivers/gpu/drm/scheduler/sched_entity.c
@@ -355,17 +355,6 @@ void drm_sched_entity_destroy(struct drm_sched_entity *entity)
}
EXPORT_SYMBOL(drm_sched_entity_destroy);
-/* drm_sched_entity_clear_dep - callback to clear the entities dependency */
-static void drm_sched_entity_clear_dep(struct dma_fence *f,
- struct dma_fence_cb *cb)
-{
- struct drm_sched_entity *entity =
- container_of(cb, struct drm_sched_entity, cb);
-
- entity->dependency = NULL;
- dma_fence_put(f);
-}
-
/*
* drm_sched_entity_wakeup - callback to clear the entity's dependency and
* wake up the scheduler
@@ -376,7 +365,8 @@ static void drm_sched_entity_wakeup(struct dma_fence *f,
struct drm_sched_entity *entity =
container_of(cb, struct drm_sched_entity, cb);
- drm_sched_entity_clear_dep(f, cb);
+ entity->dependency = NULL;
+ dma_fence_put(f);
drm_sched_wakeup(entity->rq->sched);
}
@@ -429,13 +419,6 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
fence = dma_fence_get(&s_fence->scheduled);
dma_fence_put(entity->dependency);
entity->dependency = fence;
- if (!dma_fence_add_callback(fence, &entity->cb,
- drm_sched_entity_clear_dep))
- return true;
-
- /* Ignore it when it is already scheduled */
- dma_fence_put(fence);
- return false;
}
if (!dma_fence_add_callback(entity->dependency, &entity->cb,
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 694d6b99923eb05a8fd188be44e26077d19f0e21
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025072811-ethanol-arbitrary-a664@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 694d6b99923eb05a8fd188be44e26077d19f0e21 Mon Sep 17 00:00:00 2001
From: Harry Yoo <harry.yoo(a)oracle.com>
Date: Fri, 4 Jul 2025 19:30:53 +0900
Subject: [PATCH] mm/zsmalloc: do not pass __GFP_MOVABLE if CONFIG_COMPACTION=n
Commit 48b4800a1c6a ("zsmalloc: page migration support") added support for
migrating zsmalloc pages using the movable_operations migration framework.
However, the commit did not take into account that zsmalloc supports
migration only when CONFIG_COMPACTION is enabled. Tracing shows that
zsmalloc was still passing the __GFP_MOVABLE flag even when compaction is
not supported.
This can result in unmovable pages being allocated from movable page
blocks (even without stealing page blocks), ZONE_MOVABLE and CMA area.
Possible user visible effects:
- Some ZONE_MOVABLE memory can be not actually movable
- CMA allocation can fail because of this
- Increased memory fragmentation due to ignoring the page mobility
grouping feature
I'm not really sure who uses kernels without compaction support, though :(
To fix this, clear the __GFP_MOVABLE flag when
!IS_ENABLED(CONFIG_COMPACTION).
Link: https://lkml.kernel.org/r/20250704103053.6913-1-harry.yoo@oracle.com
Fixes: 48b4800a1c6a ("zsmalloc: page migration support")
Signed-off-by: Harry Yoo <harry.yoo(a)oracle.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky(a)chromium.org>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 999b513c7fdf..f3e2215f95eb 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1043,6 +1043,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
if (!zspage)
return NULL;
+ if (!IS_ENABLED(CONFIG_COMPACTION))
+ gfp &= ~__GFP_MOVABLE;
+
zspage->magic = ZSPAGE_MAGIC;
zspage->pool = pool;
zspage->class = class->index;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 694d6b99923eb05a8fd188be44e26077d19f0e21
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025072810-footgear-grumpily-4fd5@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 694d6b99923eb05a8fd188be44e26077d19f0e21 Mon Sep 17 00:00:00 2001
From: Harry Yoo <harry.yoo(a)oracle.com>
Date: Fri, 4 Jul 2025 19:30:53 +0900
Subject: [PATCH] mm/zsmalloc: do not pass __GFP_MOVABLE if CONFIG_COMPACTION=n
Commit 48b4800a1c6a ("zsmalloc: page migration support") added support for
migrating zsmalloc pages using the movable_operations migration framework.
However, the commit did not take into account that zsmalloc supports
migration only when CONFIG_COMPACTION is enabled. Tracing shows that
zsmalloc was still passing the __GFP_MOVABLE flag even when compaction is
not supported.
This can result in unmovable pages being allocated from movable page
blocks (even without stealing page blocks), ZONE_MOVABLE and CMA area.
Possible user visible effects:
- Some ZONE_MOVABLE memory can be not actually movable
- CMA allocation can fail because of this
- Increased memory fragmentation due to ignoring the page mobility
grouping feature
I'm not really sure who uses kernels without compaction support, though :(
To fix this, clear the __GFP_MOVABLE flag when
!IS_ENABLED(CONFIG_COMPACTION).
Link: https://lkml.kernel.org/r/20250704103053.6913-1-harry.yoo@oracle.com
Fixes: 48b4800a1c6a ("zsmalloc: page migration support")
Signed-off-by: Harry Yoo <harry.yoo(a)oracle.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky(a)chromium.org>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 999b513c7fdf..f3e2215f95eb 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1043,6 +1043,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
if (!zspage)
return NULL;
+ if (!IS_ENABLED(CONFIG_COMPACTION))
+ gfp &= ~__GFP_MOVABLE;
+
zspage->magic = ZSPAGE_MAGIC;
zspage->pool = pool;
zspage->class = class->index;