Commit a0b4c7a

dhowells authored and brauner committed
netfs: Fix unbuffered/DIO writes to dispatch subrequests in strict sequence
Fix netfslib such that, when making an unbuffered or DIO write, it sends each subrequest strictly sequentially, waiting until the previous one is 'committed' before sending the next, so that we don't have pieces landing out of order and potentially leaving a hole if an error occurs (ENOSPC, for example).

This is done by copying in just those bits of issuing, collecting and retrying subrequests that are necessary to do one subrequest at a time. Retrying, in particular, is simpler because if the current subrequest needs retrying, the source iterator can just be copied again and the subrequest prepped and issued again, without needing to be concerned about whether it needs merging with the previous or next in the sequence.

Note that the issuing loop waits for a subrequest to complete right after issuing it, but this wait could be moved elsewhere, allowing preparatory steps to be performed whilst the subrequest is in progress. In particular, once content encryption is available in netfslib, that could be done whilst waiting, as could cleanup of buffers that have been completed.

Fixes: 153a996 ("netfs: Implement unbuffered/DIO write support")
Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://patch.msgid.link/58526.1772112753@warthog.procyon.org.uk
Tested-by: Steve French <sfrench@samba.org>
Reviewed-by: Paulo Alcantara (Red Hat) <pc@manguebit.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
1 parent 28aaa9c commit a0b4c7a
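To make the fix concrete before the diffs: below is a minimal user-space model in C of the pattern the patch adopts. Carve one chunk off the source, issue it, wait for it, and either advance past it or rebuild the very same chunk from the committed offset and reissue it. Everything in the sketch (CHUNK, issue_and_wait(), the simulated transient failure) is illustrative only and is not netfslib API.

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define CHUNK 4

static int calls;

/* Stand-in for stream->issue_write() plus the wait: the third issue is
 * made to fail once with -EAGAIN to exercise the retry path. */
static int issue_and_wait(const char *p, size_t n)
{
	if (++calls == 3)
		return -EAGAIN;
	printf("committed %zu bytes: %.*s\n", n, (int)n, p);
	return 0;
}

int main(void)
{
	const char src[] = "ABCDEFGHIJ";	/* the source "iterator" */
	size_t len = strlen(src);
	size_t transferred = 0;			/* bytes known to be committed */

	while (transferred < len) {
		size_t n = len - transferred;
		int ret;

		if (n > CHUNK)
			n = CHUNK;	/* carve off the next subrequest */

		ret = issue_and_wait(src + transferred, n);
		if (ret == -EAGAIN)
			continue;	/* rebuild the same chunk from 'transferred', reissue */
		if (ret < 0)
			return 1;	/* hard failure: stop, nothing beyond this was sent */
		transferred += n;	/* "collect" the completed subrequest */
	}
	return 0;
}

Because 'transferred' only advances once a chunk is committed, a hard failure stops the stream with everything up to that offset intact and nothing issued beyond it, which is exactly the no-holes property the commit message describes.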

5 files changed

Lines changed: 221 additions & 77 deletions

fs/netfs/direct_write.c

Lines changed: 212 additions & 16 deletions
@@ -9,6 +9,202 @@
 #include <linux/uio.h>
 #include "internal.h"
 
+/*
+ * Perform the cleanup rituals after an unbuffered write is complete.
+ */
+static void netfs_unbuffered_write_done(struct netfs_io_request *wreq)
+{
+	struct netfs_inode *ictx = netfs_inode(wreq->inode);
+
+	_enter("R=%x", wreq->debug_id);
+
+	/* Okay, declare that all I/O is complete. */
+	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+	if (!wreq->error)
+		netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);
+
+	if (wreq->origin == NETFS_DIO_WRITE &&
+	    wreq->mapping->nrpages) {
+		/* mmap may have got underfoot and we may now have folios
+		 * locally covering the region we just wrote. Attempt to
+		 * discard the folios, but leave in place any modified locally.
+		 * ->write_iter() is prevented from interfering by the DIO
+		 * counter.
+		 */
+		pgoff_t first = wreq->start >> PAGE_SHIFT;
+		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+
+		invalidate_inode_pages2_range(wreq->mapping, first, last);
+	}
+
+	if (wreq->origin == NETFS_DIO_WRITE)
+		inode_dio_end(wreq->inode);
+
+	_debug("finished");
+	netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
+	/* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */
+
+	if (wreq->iocb) {
+		size_t written = umin(wreq->transferred, wreq->len);
+
+		wreq->iocb->ki_pos += written;
+		if (wreq->iocb->ki_complete) {
+			trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
+			wreq->iocb->ki_complete(wreq->iocb, wreq->error ?: written);
+		}
+		wreq->iocb = VFS_PTR_POISON;
+	}
+
+	netfs_clear_subrequests(wreq);
+}
+
+/*
+ * Collect the subrequest results of unbuffered write subrequests.
+ */
+static void netfs_unbuffered_write_collect(struct netfs_io_request *wreq,
+					   struct netfs_io_stream *stream,
+					   struct netfs_io_subrequest *subreq)
+{
+	trace_netfs_collect_sreq(wreq, subreq);
+
+	spin_lock(&wreq->lock);
+	list_del_init(&subreq->rreq_link);
+	spin_unlock(&wreq->lock);
+
+	wreq->transferred += subreq->transferred;
+	iov_iter_advance(&wreq->buffer.iter, subreq->transferred);
+
+	stream->collected_to = subreq->start + subreq->transferred;
+	wreq->collected_to = stream->collected_to;
+	netfs_put_subrequest(subreq, netfs_sreq_trace_put_done);
+
+	trace_netfs_collect_stream(wreq, stream);
+	trace_netfs_collect_state(wreq, wreq->collected_to, 0);
+}
+
+/*
+ * Write data to the server without going through the pagecache and without
+ * writing it to the local cache. We dispatch the subrequests serially and
+ * wait for each to complete before dispatching the next, lest we leave a gap
+ * in the data written due to a failure such as ENOSPC. We could, however
+ * attempt to do preparation such as content encryption for the next subreq
+ * whilst the current is in progress.
+ */
+static int netfs_unbuffered_write(struct netfs_io_request *wreq)
+{
+	struct netfs_io_subrequest *subreq = NULL;
+	struct netfs_io_stream *stream = &wreq->io_streams[0];
+	int ret;
+
+	_enter("%llx", wreq->len);
+
+	if (wreq->origin == NETFS_DIO_WRITE)
+		inode_dio_begin(wreq->inode);
+
+	stream->collected_to = wreq->start;
+
+	for (;;) {
+		bool retry = false;
+
+		if (!subreq) {
+			netfs_prepare_write(wreq, stream, wreq->start + wreq->transferred);
+			subreq = stream->construct;
+			stream->construct = NULL;
+			stream->front = NULL;
+		}
+
+		/* Check if (re-)preparation failed. */
+		if (unlikely(test_bit(NETFS_SREQ_FAILED, &subreq->flags))) {
+			netfs_write_subrequest_terminated(subreq, subreq->error);
+			wreq->error = subreq->error;
+			break;
+		}
+
+		iov_iter_truncate(&subreq->io_iter, wreq->len - wreq->transferred);
+		if (!iov_iter_count(&subreq->io_iter))
+			break;
+
+		subreq->len = netfs_limit_iter(&subreq->io_iter, 0,
+					       stream->sreq_max_len,
+					       stream->sreq_max_segs);
+		iov_iter_truncate(&subreq->io_iter, subreq->len);
+		stream->submit_extendable_to = subreq->len;
+
+		trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+		stream->issue_write(subreq);
+
+		/* Async, need to wait. */
+		netfs_wait_for_in_progress_stream(wreq, stream);
+
+		if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
+			retry = true;
+		} else if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
+			ret = subreq->error;
+			wreq->error = ret;
+			netfs_see_subrequest(subreq, netfs_sreq_trace_see_failed);
+			subreq = NULL;
+			break;
+		}
+		ret = 0;
+
+		if (!retry) {
+			netfs_unbuffered_write_collect(wreq, stream, subreq);
+			subreq = NULL;
+			if (wreq->transferred >= wreq->len)
+				break;
+			if (!wreq->iocb && signal_pending(current)) {
+				ret = wreq->transferred ? -EINTR : -ERESTARTSYS;
+				trace_netfs_rreq(wreq, netfs_rreq_trace_intr);
+				break;
+			}
+			continue;
+		}
+
+		/* We need to retry the last subrequest, so first reset the
+		 * iterator, taking into account what, if anything, we managed
+		 * to transfer.
+		 */
+		subreq->error = -EAGAIN;
+		trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+		if (subreq->transferred > 0)
+			iov_iter_advance(&wreq->buffer.iter, subreq->transferred);
+
+		if (stream->source == NETFS_UPLOAD_TO_SERVER &&
+		    wreq->netfs_ops->retry_request)
+			wreq->netfs_ops->retry_request(wreq, stream);
+
+		__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+		__clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
+		__clear_bit(NETFS_SREQ_FAILED, &subreq->flags);
+		subreq->io_iter = wreq->buffer.iter;
+		subreq->start = wreq->start + wreq->transferred;
+		subreq->len = wreq->len - wreq->transferred;
+		subreq->transferred = 0;
+		subreq->retry_count += 1;
+		stream->sreq_max_len = UINT_MAX;
+		stream->sreq_max_segs = INT_MAX;
+
+		netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+		stream->prepare_write(subreq);
+
+		__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+		netfs_stat(&netfs_n_wh_retry_write_subreq);
+	}
+
+	netfs_unbuffered_write_done(wreq);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+static void netfs_unbuffered_write_async(struct work_struct *work)
+{
+	struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);
+
+	netfs_unbuffered_write(wreq);
+	netfs_put_request(wreq, netfs_rreq_trace_put_complete);
+}
+
 /*
  * Perform an unbuffered write where we may have to do an RMW operation on an
  * encrypted file. This can also be used for direct I/O writes.
@@ -70,35 +266,35 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
 			 */
 			wreq->buffer.iter = *iter;
 		}
+
+		wreq->len = iov_iter_count(&wreq->buffer.iter);
 	}
 
 	__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
-	if (async)
-		__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);
 
 	/* Copy the data into the bounce buffer and encrypt it. */
 	// TODO
 
 	/* Dispatch the write. */
 	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
-	if (async)
-		wreq->iocb = iocb;
-	wreq->len = iov_iter_count(&wreq->buffer.iter);
-	ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
-	if (ret < 0) {
-		_debug("begin = %zd", ret);
-		goto out;
-	}
 
-	if (!async) {
-		ret = netfs_wait_for_write(wreq);
-		if (ret > 0)
-			iocb->ki_pos += ret;
-	} else {
+	if (async) {
+		INIT_WORK(&wreq->work, netfs_unbuffered_write_async);
+		wreq->iocb = iocb;
+		queue_work(system_dfl_wq, &wreq->work);
 		ret = -EIOCBQUEUED;
+	} else {
+		ret = netfs_unbuffered_write(wreq);
+		if (ret < 0) {
+			_debug("begin = %zd", ret);
+		} else {
+			iocb->ki_pos += wreq->transferred;
+			ret = wreq->transferred ?: wreq->error;
+		}
+
+		netfs_put_request(wreq, netfs_rreq_trace_put_complete);
 	}
 
-out:
 	netfs_put_request(wreq, netfs_rreq_trace_put_return);
 	return ret;
 }

fs/netfs/internal.h

Lines changed: 3 additions & 1 deletion
@@ -198,6 +198,9 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
 						struct file *file,
 						loff_t start,
 						enum netfs_io_origin origin);
+void netfs_prepare_write(struct netfs_io_request *wreq,
+			 struct netfs_io_stream *stream,
+			 loff_t start);
 void netfs_reissue_write(struct netfs_io_stream *stream,
 			 struct netfs_io_subrequest *subreq,
 			 struct iov_iter *source);
@@ -212,7 +215,6 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c
 			       struct folio **writethrough_cache);
 ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
 			       struct folio *writethrough_cache);
-int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len);
 
 /*
  * write_retry.c

fs/netfs/write_collect.c

Lines changed: 0 additions & 21 deletions
@@ -399,27 +399,6 @@ bool netfs_write_collection(struct netfs_io_request *wreq)
 			ictx->ops->invalidate_cache(wreq);
 	}
 
-	if ((wreq->origin == NETFS_UNBUFFERED_WRITE ||
-	     wreq->origin == NETFS_DIO_WRITE) &&
-	    !wreq->error)
-		netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);
-
-	if (wreq->origin == NETFS_DIO_WRITE &&
-	    wreq->mapping->nrpages) {
-		/* mmap may have got underfoot and we may now have folios
-		 * locally covering the region we just wrote. Attempt to
-		 * discard the folios, but leave in place any modified locally.
-		 * ->write_iter() is prevented from interfering by the DIO
-		 * counter.
-		 */
-		pgoff_t first = wreq->start >> PAGE_SHIFT;
-		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
-		invalidate_inode_pages2_range(wreq->mapping, first, last);
-	}
-
-	if (wreq->origin == NETFS_DIO_WRITE)
-		inode_dio_end(wreq->inode);
-
 	_debug("finished");
 	netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
 	/* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */

fs/netfs/write_issue.c

Lines changed: 3 additions & 38 deletions
@@ -154,9 +154,9 @@ EXPORT_SYMBOL(netfs_prepare_write_failed);
  * Prepare a write subrequest. We need to allocate a new subrequest
  * if we don't have one.
  */
-static void netfs_prepare_write(struct netfs_io_request *wreq,
-				struct netfs_io_stream *stream,
-				loff_t start)
+void netfs_prepare_write(struct netfs_io_request *wreq,
+			 struct netfs_io_stream *stream,
+			 loff_t start)
 {
 	struct netfs_io_subrequest *subreq;
 	struct iov_iter *wreq_iter = &wreq->buffer.iter;
@@ -698,41 +698,6 @@ ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_c
 	return ret;
 }
 
-/*
- * Write data to the server without going through the pagecache and without
- * writing it to the local cache.
- */
-int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len)
-{
-	struct netfs_io_stream *upload = &wreq->io_streams[0];
-	ssize_t part;
-	loff_t start = wreq->start;
-	int error = 0;
-
-	_enter("%zx", len);
-
-	if (wreq->origin == NETFS_DIO_WRITE)
-		inode_dio_begin(wreq->inode);
-
-	while (len) {
-		// TODO: Prepare content encryption
-
-		_debug("unbuffered %zx", len);
-		part = netfs_advance_write(wreq, upload, start, len, false);
-		start += part;
-		len -= part;
-		rolling_buffer_advance(&wreq->buffer, part);
-		if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags))
-			netfs_wait_for_paused_write(wreq);
-		if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
-			break;
-	}
-
-	netfs_end_issue_write(wreq);
-	_leave(" = %d", error);
-	return error;
-}
-
 /*
  * Write some of a pending folio data back to the server and/or the cache.
  */

include/trace/events/netfs.h

Lines changed: 3 additions & 1 deletion
@@ -57,6 +57,7 @@
 	EM(netfs_rreq_trace_done,		"DONE   ")	\
 	EM(netfs_rreq_trace_end_copy_to_cache,	"END-C2C")	\
 	EM(netfs_rreq_trace_free,		"FREE   ")	\
+	EM(netfs_rreq_trace_intr,		"INTR   ")	\
 	EM(netfs_rreq_trace_ki_complete,	"KI-CMPL")	\
 	EM(netfs_rreq_trace_recollect,		"RECLLCT")	\
 	EM(netfs_rreq_trace_redirty,		"REDIRTY")	\
@@ -169,7 +170,8 @@
 	EM(netfs_sreq_trace_put_oom,		"PUT OOM ")	\
 	EM(netfs_sreq_trace_put_wip,		"PUT WIP ")	\
 	EM(netfs_sreq_trace_put_work,		"PUT WORK ")	\
-	E_(netfs_sreq_trace_put_terminated,	"PUT TERM ")
+	EM(netfs_sreq_trace_put_terminated,	"PUT TERM ")	\
+	E_(netfs_sreq_trace_see_failed,		"SEE FAILED ")
 
 #define netfs_folio_traces \
 	EM(netfs_folio_is_uptodate,		"mod-uptodate")	\
