On 08/31/2018 10:37 AM, Tushar Dave wrote:
On 08/30/2018 12:20 AM, Daniel Borkmann wrote:
On 08/30/2018 02:21 AM, Tushar Dave wrote:
On 08/29/2018 05:07 PM, Tushar Dave wrote:
While doing some preliminary testing it is found that bpf helper
bpf_msg_pull_data does not calculate the data and data_end offset
correctly. Fix it!
Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data")
Signed-off-by: Tushar Dave <tushar.n.d...@oracle.com>
Acked-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
net/core/filter.c | 38 +++++++++++++++++++++++++-------------
1 file changed, 25 insertions(+), 13 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index c25eb36..3eeb3d6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2285,7 +2285,7 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
BPF_CALL_4(bpf_msg_pull_data,
struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
{
- unsigned int len = 0, offset = 0, copy = 0;
+ unsigned int len = 0, offset = 0, copy = 0, off = 0;
struct scatterlist *sg = msg->sg_data;
int first_sg, last_sg, i, shift;
unsigned char *p, *to, *from;
@@ -2299,22 +2299,30 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff
*msg)
i = msg->sg_start;
do {
len = sg[i].length;
- offset += len;
if (start < offset + len)
break;
+ offset += len;
i++;
if (i == MAX_SKB_FRAGS)
i = 0;
- } while (i != msg->sg_end);
+ } while (i <= msg->sg_end);
I don't think this condition is correct, msg operates as a scatterlist ring,
so sg_end may very well be < current i when there's a wrap-around in the
traversal ... more below.
I'm wondering then how this is suppose to work in case sg list is not
ring! For RDS, We have sg list that is not a ring. More below.
+ /* return error if start is out of range */
if (unlikely(start >= offset + len))
return -EINVAL;
- if (!msg->sg_copy[i] && bytes <= len)
- goto out;
+ /* return error if i is last entry in sglist and end is out of range */
+ if (msg->sg_copy[i] && end > offset + len)
+ return -EINVAL;
first_sg = i;
+ /* if i is not last entry in sg list and end (i.e start + bytes) is
+ * within this sg[i] then goto out and calculate data and data_end
+ */
+ if (!msg->sg_copy[i] && end <= offset + len)
+ goto out;
+
/* At this point we need to linearize multiple scatterlist
* elements or a single shared page. Either way we need to
* copy into a linear buffer exclusively owned by BPF. Then
@@ -2330,9 +2338,14 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
i++;
if (i == MAX_SKB_FRAGS)
i = 0;
- if (bytes < copy)
+ if (end < copy)
break;
- } while (i != msg->sg_end);
+ } while (i <= msg->sg_end);
+
+ /* return error if i is last entry in sglist and end is out of range */
+ if (i > msg->sg_end && end > offset + copy)
+ return -EINVAL;
+
last_sg = i;
if (unlikely(copy < end - start))
@@ -2342,23 +2355,22 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff
*msg)
if (unlikely(!page))
return -ENOMEM;
p = page_address(page);
- offset = 0;
i = first_sg;
do {
from = sg_virt(&sg[i]);
len = sg[i].length;
- to = p + offset;
+ to = p + off;
memcpy(to, from, len);
- offset += len;
+ off += len;
sg[i].length = 0;
put_page(sg_page(&sg[i]));
i++;
if (i == MAX_SKB_FRAGS)
i = 0;
- } while (i != last_sg);
+ } while (i < last_sg);
sg[first_sg].length = copy;
sg_set_page(&sg[first_sg], page, copy, 0);
@@ -2380,7 +2392,7 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
else
move_from = i + shift;
- if (move_from == msg->sg_end)
+ if (move_from > msg->sg_end)
break;
sg[i] = sg[move_from];
@@ -2396,7 +2408,7 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
if (msg->sg_end < 0)
msg->sg_end += MAX_SKB_FRAGS;
out:
- msg->data = sg_virt(&sg[i]) + start - offset;
+ msg->data = sg_virt(&sg[first_sg]) + start - offset;
msg->data_end = msg->data + bytes;
return 0;
Please discard this patch. I just noticed that Daniel Borkmann sent some
similar fixes for bpf_msg_pull_data.
Yeah I've been looking at these recently as well, please make sure you test
with the below fixes included to see if there's anything left:
I tested the latest net tree which has all the fixes you mentioned and I
am still seeing issues.
As I already mentioned before on RFC v3 thread, we need to be careful
reusing 'offset' while linearizing multiple scatterlist
elements.
Variable 'offset' is used to calculate the 'msg->data'
i.e. msg->data = sg_virt(&sg[first_sg]) + start - offset"
We should use different variable name (e.g. off) while linearizing
multiple scatterlist elements or a single shared page so that we don't
overwrite 'offset' that has correct value already been calculated.
Agree, sigh, that's also buggy, please submit the below as proper patch,
thanks!
A code change for this would look like:
diff --git a/net/core/filter.c b/net/core/filter.c
index 60a29ca..076ca09 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2326,7 +2326,7 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
BPF_CALL_4(bpf_msg_pull_data,
struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
{
- unsigned int len = 0, offset = 0, copy = 0;
+ unsigned int len = 0, offset = 0, copy = 0, off;
Nit: probably better name it 'poffset' since this is relative to p.
int bytes = end - start, bytes_sg_total;
struct scatterlist *sg = msg->sg_data;
int first_sg, last_sg, i, shift;
@@ -2354,6 +2354,8 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
* account for the headroom.
*/
bytes_sg_total = start - offset + bytes;
if (!msg->sg_copy[i] && bytes_sg_total <= len)
goto out;
@@ -2382,18 +2384,18 @@ struct sock *do_msg_redirect_map(struct sk_msg_buff
*msg)
if (unlikely(!page))
return -ENOMEM;
p = page_address(page);
- offset = 0;
+ off = 0;
i = first_sg;
do {
from = sg_virt(&sg[i]);
len = sg[i].length;
- to = p + offset;
+ to = p + off;
memcpy(to, from, len);
- offset += len;
+ off += len;
sg[i].length = 0;
put_page(sg_page(&sg[i]));
sk_msg_iter_var(i);
} while (i != last_sg);
Apart from above bug fix, I see issues in below 2 cases:
Ok.
case 1:
=======
For things like RDS, where sg list is not a ring how to know end of packet?
For example I have RDS packet of length 8192 Bytes which is in
scatterlist like
sge[0].lenghth = 1400
sge[1].length = 1448
sge[2].length = 1448
sge[3].length = 1448
sge[4].length = 1448
sge[5].length = 1000
Now when I execute "bpf_msg_pull_data(msg, 8190, 8196, 0)" on above RDS
packet, I expect bpf_msg_pull_data to return error because end is out of
bound e.g. 8196 > 8192. Instead current code wraps it to sge index 0!
That is if I dump first 6 bytes starting with 'start' I get byte value
at offset 8190, 8191 from sge[5] and value at offset 8192, 8193, 8194
8195 bytes from sge[0] from offset 0 ,1 ,2 ,3 respectively - which is
not expected!
For the sg ring case which is what is upstream today, it will find start_sg
as 5 and last_sg as 6. And therefore bail out in bytes_sg_total > copy test
since 1004 > 1000. sge[5] offset is 7192 with 1000 bytes len, so start is at
offset 998 from there and end at 1004. So I get -EINVAL from the mentioned
test.
Is msg->sg_start and msg->sg_end set properly in your RDS case? As long as
there's no wrap-around the helper should be usable also in RDS case. I'm not
sure though why you see above oob.
case 2:
=======
Same example of RDS packet as above, when I execute
"bpf_msg_pull_data(msg, 7191, 8192, 0)"
I get -EINVAL which is *wrong*.
Debugging this, I found, right before we enter below do while loop the values
of variables are:
i = 4
first_sg = 4
last_sg= 5
start=7191
bytes_sg_total=2448
offset=5744
copy=0
len=1448
do {
copy += sg[i].length;
sk_msg_iter_var(i);
if (bytes_sg_total <= copy)
break;
} while (i != msg->sg_end);
last_sg = i;
However, above loop execute only one time because after one execution,
i=5 and msg->sg_end=5. That makes condition "while (i != msg->sg_end)"
false. So when loop exit, the values are:
i = 5
first_sg=4
last_sg= 5
start=7191
bytes_sg_total=2448 <<<<<<
offset=5744
copy=1448 <<<<<<
len=1448
And that makes below condition true
if (unlikely(bytes_sg_total > copy))
return -EINVAL;
Hmm, is your msg->sg_end correct? I'm getting start_sg of 4 and last_sg of 6.
So it breaks on the bytes_sg_total <= copy test, and then it merges sges 4 and 5
into 4 with 2448 length while dropping sge 5. How do you set up sk_msg_buff?
There may be one or more similar corner scenarios like case 1 and 2
which can be fixed however I am not sure how can we fix so that we can
get it right for sg ring and sg list?
Imho, it should be sufficient to have invariant of msg->sg_start < msg->sg_end
and having msg->sg_start point to the starting sge while msg->sg_end to the
last sge slot + 1.