<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:12:18 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92" >
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-977] incorrect round robin object allocation </title>
                <link>https://jira.whamcloud.com/browse/LU-977</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;&lt;a href=&quot;https://bugzilla.lustre.org/show_bug.cgi?id=24194&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://bugzilla.lustre.org/show_bug.cgi?id=24194&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;The bug is caused by incorrect locking in the lov_qos code and can be easily reproduced by the following test:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c 
index a101e9c..64ccefb 100644 
--- a/lustre/lov/lov_qos.c 
+++ b/lustre/lov/lov_qos.c 
@@ -627,6 +627,8 @@ static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt, 

 repeat_find: 
         array_idx = (lqr-&amp;gt;lqr_start_idx + lqr-&amp;gt;lqr_offset_idx) % osts-&amp;gt;op_count; 
+ CFS_FAIL_TIMEOUT_MS(OBD_FAIL_MDS_LOV_CREATE_RACE, 100); 
+ 
         idx_pos = idx_arr; 
 #ifdef QOS_DEBUG 
         CDEBUG(D_QOS, &quot;pool &apos;%s&apos; want %d startidx %d startcnt %d offset %d &quot;

test_51() {
        local obj1
        local obj2
        local old_rr

        mkdir -p $DIR1/$tfile-1/
        mkdir -p $DIR2/$tfile-2/
        old_rr=$(do_facet $SINGLEMDS lctl get_param -n &apos;lov.lustre-MDT*/qos_threshold_rr&apos; | sed -e
&apos;s/%//&apos;)
        do_facet $SINGLEMDS lctl set_param -n &apos;lov.lustre-MDT*/qos_threshold_rr&apos; 100
#define OBD_FAIL_MDS_LOV_CREATE_RACE     0x148
        do_facet $SINGLEMDS &quot;lctl set_param fail_loc=0x80000148&quot;
        touch $DIR1/$tfile-1/file1 &amp;amp;
        PID1=$!
        touch $DIR2/$tfile-2/file2 &amp;amp;
        PID2=$!
        wait $PID2
        wait $PID1
        do_facet $SINGLEMDS &quot;lctl set_param fail_loc=0x0&quot;
        do_facet $SINGLEMDS &quot;lctl set_param -n &apos;lov.lustre-MDT*/qos_threshold_rr&apos; $old_rr&quot;

        obj1=$($GETSTRIPE -o $DIR1/$tfile-1/file1)
        obj2=$($GETSTRIPE -o $DIR1/$tfile-2/file2)
        [ $obj1 -eq $obj2 ] &amp;amp;&amp;amp; error &quot;must different ost used&quot;
}
run_test 51 &quot;alloc_rr should be allocate on correct order&quot;
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;The bug was found in 2.x but should also exist in 1.8.&lt;/p&gt;

&lt;p&gt;CFS_FAIL_TIMEOUT_MS can be replaced with CFS_RACE()&lt;/p&gt;</description>
                <environment>any lustre from a 1.6.0</environment>
        <key id="12848">LU-977</key>
            <summary>incorrect round robin object allocation </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="3" iconUrl="https://jira.whamcloud.com/images/icons/priorities/major.svg">Major</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bogl">Bob Glossman</assignee>
                                    <reporter username="shadow">Alexey Lyashkov</reporter>
                        <labels>
                            <label>llnl</label>
                            <label>patch</label>
                    </labels>
                <created>Tue, 10 Jan 2012 01:33:36 +0000</created>
                <updated>Wed, 19 Oct 2022 01:03:27 +0000</updated>
                            <resolved>Fri, 21 Aug 2015 22:10:36 +0000</resolved>
                                                    <fixVersion>Lustre 2.8.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>18</watches>
                                                                            <comments>
                            <comment id="33656" author="shadow" created="Fri, 6 Apr 2012 02:18:57 +0000"  >&lt;p&gt;Andreas,&lt;/p&gt;

&lt;p&gt;it&apos;s not a 1.8 only problem, that problem exist from initial LOV QoS implementation in 1.6.0.&lt;/p&gt;</comment>
                            <comment id="33776" author="shadow" created="Fri, 6 Apr 2012 05:38:55 +0000"  >&lt;p&gt;remote: New Changes:&lt;br/&gt;
remote:   &lt;a href=&quot;http://review.whamcloud.com/2462&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/2462&lt;/a&gt;&lt;br/&gt;
remote: &lt;/p&gt;</comment>
                            <comment id="46089" author="shadow" created="Sat, 6 Oct 2012 14:45:49 +0000"  >&lt;p&gt;I glad to see, someone from WC look to patches after half year of waiting. Very nice speed.&lt;br/&gt;
Can you explain - did you want patches or not?&lt;/p&gt;

&lt;p&gt;Now that patch need totally reworked as LOV code moved into LOD, and OSP have different object allocation strategy.&lt;br/&gt;
may WC port that patch ourself or i need rework it?&lt;/p&gt;</comment>
                            <comment id="46091" author="pjones" created="Sat, 6 Oct 2012 15:43:28 +0000"  >&lt;p&gt;Shadow&lt;/p&gt;

&lt;p&gt;If you ever have any concerns that something has not been given the correct priority then please raise to the CDWG via your representative (Nic Henke)&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="46151" author="bzzz" created="Mon, 8 Oct 2012 08:36:37 +0000"  >&lt;p&gt;Alexey, this work (I mean lod/osp) started in Sun/Oracle, we presented all this internally and for public few times. &lt;/p&gt;

&lt;p&gt;I think it makes sense to write a detailed explanation of what specifically is wrong with QoS (which is known to be imperfect)&lt;br/&gt;
and how this can be fixed in terms of HLD. then it&apos;ll be easier to see how to fit it better into the new model.&lt;/p&gt;</comment>
                            <comment id="46158" author="shadow" created="Mon, 8 Oct 2012 10:04:05 +0000"  >&lt;p&gt;Alex,&lt;/p&gt;

&lt;p&gt;I understand it&apos;s started by Sun/Oracle - but i think bugfixes should be incorporated before it will pushed in repository. That is not a bug in HLD, but bug in implementation.&lt;/p&gt;

&lt;p&gt;currently lov/lod (i see LOD have same bug) internal state &lt;br/&gt;
&amp;gt;&amp;gt;&lt;br/&gt;
lqr-&amp;gt;lqr_start_idx + lqr-&amp;gt;lqr_offset_idx)&lt;br/&gt;
&amp;gt;&amp;gt;&lt;/p&gt;

&lt;p&gt;isn&apos;t protected with any locks in that case we able to start allocate objects for more then one process on same osc target &lt;/p&gt;


&lt;p&gt;        cfs_down_read(&amp;amp;m-&amp;gt;lod_qos.lq_rw_sem);  &amp;lt;&amp;lt; allow parallel modification for lqr_start_idx.&lt;br/&gt;
        ost_start_idx_temp = lqr-&amp;gt;lqr_start_idx;&lt;/p&gt;

&lt;p&gt;repeat_find:&lt;br/&gt;
        array_idx = (lqr-&amp;gt;lqr_start_idx + lqr-&amp;gt;lqr_offset_idx) %&lt;br/&gt;
                        osts-&amp;gt;op_count;&lt;br/&gt;
but we was not able to protect it via spinlock (exclusive mutex is bad for fast path)&lt;br/&gt;
because obd_prealloc may sleep - so it&apos;s introduce a separate thread to create an object as LOD already have.&lt;/p&gt;

&lt;p&gt;that is first bug in that area.&lt;/p&gt;

&lt;p&gt;second bug, related to data targets pools (sorry, I don&apos;t remember the details after a year).&lt;/p&gt;

&lt;p&gt;and some optimization to avoid find pool again if RR allocation chooses. &lt;/p&gt;
</comment>
                            <comment id="46160" author="shadow" created="Mon, 8 Oct 2012 10:06:05 +0000"  >&lt;p&gt;Peter,&lt;/p&gt;

&lt;p&gt;it&apos;s good question - so if someone will don&apos;t ping you - you never will look in review queue? don&apos;t try to find a dependencies for &lt;em&gt;already&lt;/em&gt; submitted fixes. I don&apos;t talk about tickets without fixes, but submitted patch should reviewed for half year, in CFS time you never say something similar that.&lt;/p&gt;</comment>
                            <comment id="46168" author="pjones" created="Mon, 8 Oct 2012 10:47:00 +0000"  >&lt;p&gt;Shadow&lt;/p&gt;

&lt;p&gt;I am not 100% confident that I follow what you are saying but two possible threads seem to be&lt;/p&gt;

&lt;p&gt;1) Development work done in parallel has clashed with this particular suggested change since it was initially submitted and this was not caught ahead of time&lt;/p&gt;

&lt;p&gt;2) This patch did not receive apparent attention for a long time.&lt;/p&gt;

&lt;p&gt;Your initial comment I took to refer to just 2 and I was reluctant to clutter up a ticket with talk of process, but the simple matter is that we have more things we could work on than time and we need to prioritize( and I know that you know about such matters because I note &lt;a href=&quot;http://review.whamcloud.com/#change,2342&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,2342&lt;/a&gt; has been waiting similarly long period for your attention). &lt;/p&gt;

&lt;p&gt;This suggested change was reviewed when it first came in and put in a lower priority category because it was not a stability issue&lt;/p&gt;

&lt;p&gt;My comment about the CDWG is because that is the agreed forum to work our prioritization of our attention to issues. Making comments on tickets is not helpful because it is easy for those to get missed (I stumbled upon the above by pure chance)&lt;/p&gt;

&lt;p&gt;As for 1, I&apos;m afraid that this is a new challenge for us in our growing and more diverse Lustre development community. Things were certainly easier to manage in the CFS days, but I think that the way we are going to overcome these challenges is by good quality communication. Again, I think that the CDWG is a natural forum for that to take place.&lt;/p&gt;

&lt;p&gt;Anyway, sorry if I am missing the point altogether but there is a CDWG meeting on Wednesday so hopefully everything can get clarified then.&lt;/p&gt;

&lt;p&gt;Regards&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="48229" author="nrutman" created="Wed, 21 Nov 2012 17:43:30 +0000"  >&lt;p&gt;Xyratex-bug-id: &lt;a href=&quot;http://jira-nss.xy01.xyratex.com:8080/browse/MRP-206&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;MRP-206&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="49987" author="keith" created="Fri, 4 Jan 2013 21:32:22 +0000"  >&lt;p&gt;Is there going to be forward progress with this issue? &lt;/p&gt;</comment>
                            <comment id="50066" author="nrutman" created="Mon, 7 Jan 2013 13:15:11 +0000"  >&lt;p&gt;I think Shadow was frustrated that he submitted a patch that was ignored for a long time.  If it had been landed, it would have been included in the LOD porting.  Since it was not, new work needs to be done to port it.  He asks if Intel will port (and land) the patch or whether he needs to port it himself and resubmit.&lt;/p&gt;</comment>
                            <comment id="50080" author="pjones" created="Mon, 7 Jan 2013 16:35:22 +0000"  >&lt;p&gt;Nathan&lt;/p&gt;

&lt;p&gt;It is quite possible that Intel can assist in the work necessary to bring this patch up to date if the CDWG thinks that this warrants attention over other competing priorities. To date, the Xyratex representative on the CDWG has not raised this (or any other issue) as warranting more attention than it is presently receiving. There is another call coming up this Wednesday so Xyratex will be able to raise it then if need be.&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="50100" author="shadow" created="Tue, 8 Jan 2013 02:32:55 +0000"  >&lt;p&gt;Peter,&lt;/p&gt;

&lt;p&gt;that patch need rewrites completely because LOV layer is removed and LOD introduced.&lt;br/&gt;
May be better to ask LOD author to fix it, anyway that is easy now - because lod/osc introduce creation thread already.&lt;/p&gt;</comment>
                            <comment id="50104" author="bzzz" created="Tue, 8 Jan 2013 04:04:40 +0000"  >&lt;p&gt;this can not be done easily because lod_alloc_rr() is doing allocation within that loop, so we can&apos;t put the whole loop under a spinlock.&lt;/p&gt;

&lt;p&gt;but probably we can shift lqr_start_idx to the next OST when another OST is used in the striping:&lt;/p&gt;

&lt;ul&gt;
	&lt;li&gt;We&apos;ve successfully declared (reserved) an object&lt;br/&gt;
		 */&lt;br/&gt;
		lod_qos_ost_in_use(env, stripe_idx, ost_idx);&lt;br/&gt;
		lo-&amp;gt;ldo_stripe&lt;span class=&quot;error&quot;&gt;&amp;#91;stripe_idx&amp;#93;&lt;/span&gt; = o;&lt;br/&gt;
		stripe_idx++;&lt;br/&gt;
+               spin_lock(...);&lt;br/&gt;
+               lqr-&amp;gt;lqr_start_idx = next(ost_idx);&lt;br/&gt;
+               spin_unlock(...);&lt;/li&gt;
&lt;/ul&gt;


&lt;p&gt;I don&apos;t think QoS is supposed to be absolutely reliable in terms of &quot;X is used, move to Y&quot;. some small &quot;mistakes&quot; and variation should be OK, IMHO.&lt;/p&gt;

&lt;p&gt;as for the second problem I&apos;d like to see a bit better description, if possible.&lt;/p&gt;</comment>
                            <comment id="50332" author="shadow" created="Fri, 11 Jan 2013 05:06:11 +0000"  >&lt;p&gt;may be that is solution - because original problem when we have isn&apos;t same allocation on whole OST in cluster.&lt;br/&gt;
other notices from my view - we may kill a statfs from that loop - because that is too slow operation in fast path.&lt;/p&gt;

&lt;p&gt;PS. was wrong. that is not a solution - because we may shift for whole loop when release a spinlock, so will allocate two objects on same ost for one file. &lt;/p&gt;
</comment>
                            <comment id="50333" author="bzzz" created="Fri, 11 Jan 2013 05:09:16 +0000"  >&lt;p&gt;well, statfs() is basically a memcpy() in this case.&lt;/p&gt;</comment>
                            <comment id="50336" author="bzzz" created="Fri, 11 Jan 2013 06:20:52 +0000"  >&lt;p&gt;again, I think there is no requirement for the algorithm to be totally precise.. and if for some reason you want serialization, just do not shift - take and increment current lqr_start_idx on the every iteration.&lt;/p&gt;</comment>
                            <comment id="50389" author="shadow" created="Sun, 13 Jan 2013 04:44:06 +0000"  >&lt;p&gt;Alex,&lt;/p&gt;

&lt;p&gt;we have two requirements&lt;br/&gt;
1) MD object should be an allocate objects from different OST&apos;s, and avoid situation when two objects from same ost assigned to one MD object (will reduce speed).&lt;br/&gt;
2) whole allocation should be distribute ost objects evenly over all ost&apos;s - again we need evenly load for a all ost&apos;s.&lt;/p&gt;
</comment>
                            <comment id="50390" author="bzzz" created="Sun, 13 Jan 2013 04:49:47 +0000"  >&lt;p&gt;the 2nd requirement can&apos;t be achieved just because object doesn&apos;t imply same amount of data and IO pattern. so, I don&apos;t think some variation will be that bad.&lt;/p&gt;</comment>
                            <comment id="50404" author="shadow" created="Mon, 14 Jan 2013 06:23:28 +0000"  >&lt;p&gt;Alex,&lt;/p&gt;

&lt;p&gt;about second, i mean if we have 20 allocations and 5 ost&apos;s - we need to have 4 allocations on each ost&apos;s - otherwise that is isn&apos;t round-robin allocation. and we have more load to same one or more ost&apos;s with same workload pattern.&lt;/p&gt;</comment>
                            <comment id="54098" author="shadow" created="Fri, 15 Mar 2013 07:04:36 +0000"  >&lt;p&gt;did you have plans to fix it?&lt;/p&gt;</comment>
                            <comment id="54327" author="keith" created="Tue, 19 Mar 2013 00:27:55 +0000"  >&lt;p&gt;Alexey,  What is the worst case allocation that you have seen?  It still sounds like you want a &quot;totally precise&quot; client / ost allocation mapping.  &lt;/p&gt;</comment>
                            <comment id="54345" author="shadow" created="Tue, 19 Mar 2013 05:38:46 +0000"  >&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Eugene Birkine added a comment - 06/Dec/11 9:05 PM
Debug log file from MDS with qos_threshold_rr=100 during 16 file writes. The file distribution was:
testfs-OST0000
2
testfs-OST0001
3
testfs-OST0002
2
testfs-OST0003
1
testfs-OST0004
2
testfs-OST0005
3
testfs-OST0006
1
testfs-OST0007
2
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
</comment>
                            <comment id="54454" author="bzzz" created="Wed, 20 Mar 2013 11:01:43 +0000"  >&lt;p&gt;totally precise RR is not possible with DNE, for example.&lt;/p&gt;</comment>
                            <comment id="58978" author="spitzcor" created="Tue, 21 May 2013 15:38:19 +0000"  >&lt;p&gt;I don&apos;t see how &apos;precise&apos; RR is not possible with DNE.  If an application wants evenly balanced stripe allocation, that should still be possible as the allocators aren&apos;t linked in DNE.  So then if the one MDS allocator hasn&apos;t switched to the space-based allocator, then round-robin should still be (mostly) &apos;precise&apos;, correct?&lt;/p&gt;</comment>
                            <comment id="58999" author="shadow" created="Tue, 21 May 2013 18:39:14 +0000"  >&lt;p&gt;If i correctly understand Alex, they mean different MDT may allocate on same OST so OST may have a different allocated objects. But it&apos;s false if each MDT have own OST pools assigned.&lt;/p&gt;</comment>
                            <comment id="59348" author="bzzz" created="Mon, 27 May 2013 05:26:24 +0000"  >&lt;p&gt;Cory, notice &quot;totally&quot;, which is possible only with very strong locking around allocation, IMHO. which is virtually not possible with DNE and not very good on multicore?&lt;/p&gt;</comment>
                            <comment id="113724" author="gerrit" created="Wed, 29 Apr 2015 08:08:48 +0000"  >&lt;p&gt;Rahul Deshmukh (rahul.deshmukh@seagate.com) uploaded a new patch: &lt;a href=&quot;http://review.whamcloud.com/14636&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/14636&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-977&quot; title=&quot;incorrect round robin object allocation &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-977&quot;&gt;&lt;del&gt;LU-977&lt;/del&gt;&lt;/a&gt; lod: Patch to protect lqr_start_idx&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: 1&lt;br/&gt;
Commit: 86aa10e5b8c6b944c10ce224078ba4f6aafbe6eb&lt;/p&gt;</comment>
                            <comment id="113725" author="520557" created="Wed, 29 Apr 2015 08:12:06 +0000"  >&lt;p&gt;Posted the new patch to address this issue. Please review.&lt;/p&gt;</comment>
                            <comment id="120548" author="520557" created="Tue, 7 Jul 2015 11:56:18 +0000"  >&lt;p&gt;Adding testing output demonstrating the fix :&lt;/p&gt;

&lt;p&gt;NOTE: The tests were run on real hardware on Lustre 2.5.1 version and with the fix where separate function was not created for the function calls inside the loop. I hope that&apos;s fine.&lt;/p&gt;

&lt;p&gt;WITHOUT Fix&lt;br/&gt;
=============&lt;br/&gt;
For reproducing the issue, a cluster with 12 OSTs and 12 client machines were used. Each client is a 24 cpu cores machine.&lt;br/&gt;
IOR load is like the first example from CEAP-82 with smaller i/o transfer and block parameters.&lt;br/&gt;
number of threads is 12 * 24, 24 threads per each client.&lt;br/&gt;
To increase concurrency, each thread was operating through own lustre mount point, so each client had 24 lustre mounts.&lt;br/&gt;
the main script:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[CEAP-82]$ cat ceap-82.sh 
#! /bin/bash


MPIEXEC=/home/bloewe/libs/mpich-3.1/install/bin/mpiexec
# EXE=/home/bloewe/benchmarks/mdtest-1.9.3/mdtest
EXE=/home/bloewe/benchmarks/IOR-2.10.3/src/C/IOR
NPROCS=$((12 * 24))
TARGET=/mnt/lustre/ceap-82
MPISCRIPT=/home/bloewe/CEAP-82/mpi-script.sh

HOSTS=$PWD/hostfile

# OPTS=&lt;span class=&quot;code-quote&quot;&gt;&quot;-a POSIX -B -C -E -F -e -g -k -b 4g -t 32m -vvv -o /lustre/crayadm/tmp/testdir.12403/IOR_POSIX&quot;&lt;/span&gt;
# OPTS=&lt;span class=&quot;code-quote&quot;&gt;&quot;-a POSIX -B -C -E -F -e -g -k -b 200m -t 2m -vvv -o /mnt/lustre/ceap-82/IOR_POSIX&quot;&lt;/span&gt;
OPTS=&lt;span class=&quot;code-quote&quot;&gt;&quot;-a POSIX -B -C -E -F -e -g -k -b 200m -t 2m -v&quot;&lt;/span&gt;


# mkdir -p distr
# 
# &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; n in {0..100}; &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt; 
# 	rm -f /mnt/lustre/ceap-82/IOR_POSIX*
# 	$MPIEXEC -f $HOSTS -np $NPROCS $EXE $OPTS
# 	&lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; x in /mnt/lustre/ceap-82/IOR_POSIX*; &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt;
# 		lfs getstripe -i $x
# 	done | sort -n | uniq -c &amp;gt; distr/d-$n
# done

rm -f /mnt/lustre/ceap-82/IOR_POSIX*
$MPIEXEC -f $HOSTS -np $NPROCS $MPISCRIPT $OPTS
&lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; x in /mnt/lustre/ceap-82/IOR_POSIX*; &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt;
     lfs getstripe -i $x
done | sort -n | uniq -c
[CEAP-82]$ 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;and an IOR wrapper, needed to map 24 IOR instances to 24 lustre mount points:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[CEAP-82]$ cat mpi-script.sh 
#! /bin/bash

LMOUNT=/mnt/lustre$(($PMI_RANK % 24))
exec /home/bloewe/benchmarks/IOR-2.10.3/src/C/IOR &lt;span class=&quot;code-quote&quot;&gt;&quot;$@&quot;&lt;/span&gt; -o $LMOUNT/ceap-82/IOR_POSIX
[CEAP-82]$ 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;
&lt;p&gt;This way we simulate client load from 24 * 12 = 288 clients.&lt;br/&gt;
The result should be an equal distribution of IOR working files between all OSTs, 24 files per OST, if RR algorithm works correctly.&lt;/p&gt;

&lt;p&gt;However , result of the test run shows uneven files distribution:&lt;/p&gt;

&lt;p&gt;Max Write: 5988.51 MiB/sec (6279.41 MB/sec)&lt;br/&gt;
Max Read:  10008.34 MiB/sec (10494.51 MB/sec)&lt;/p&gt;

&lt;p&gt;Run finished: Sat Mar 21 14:10:12 2015&lt;br/&gt;
     26 0&lt;br/&gt;
     25 1&lt;br/&gt;
     24 2&lt;br/&gt;
     29 3&lt;br/&gt;
     20 4&lt;br/&gt;
     23 5&lt;br/&gt;
     24 6&lt;br/&gt;
     25 7&lt;br/&gt;
     25 8&lt;br/&gt;
     24 9&lt;br/&gt;
     23 10&lt;br/&gt;
     20 11&lt;br/&gt;
&lt;span class=&quot;error&quot;&gt;&amp;#91;CEAP-82&amp;#93;&lt;/span&gt;$ &lt;/p&gt;

&lt;p&gt;number of files per OST varies from 20 to 29. It cannot be explained by &quot;reseeds&quot; in lod_rr_alloc().&lt;/p&gt;

&lt;p&gt;The log from MDS ssh session with setting qos_threshold_rr:&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;[root@orange00 ~]# pdsh -g mds lctl set_param lod.*.qos_threshold_rr=100
orange03: error: set_param: /proc/{fs,sys}/{lnet,lustre}/lod/*/qos_threshold_rr: Found no match
pdsh@orange00: orange03: ssh exited with exit code 3
orange10: lod.orangefs-MDT0001-mdtlov.qos_threshold_rr=100
orange12: lod.orangefs-MDT0003-mdtlov.qos_threshold_rr=100
orange11: lod.orangefs-MDT0002-mdtlov.qos_threshold_rr=100
orange13: lod.orangefs-MDT0004-mdtlov.qos_threshold_rr=100
orange02: lod.orangefs-MDT0000-mdtlov.qos_threshold_rr=100
[root@orange00 ~]# slogin orange02
Last login: Sat Mar 21 14:07:41 PDT 2015 from 172.16.2.3 on ssh
[root@orange02 ~]# lctl set_param lctl set_param debug=-1 subsystem_debug=lov debug_mb=1200
error: set_param: /proc/{fs,sys}/{lnet,lustre}/lctl: Found no match
debug=-1
subsystem_debug=lov
debug_mb=1200
[root@orange02 ~]# lctl set_param debug=-1 subsystem_debug=lov debug_mb=1200
debug=-1
subsystem_debug=lov
debug_mb=1200
[root@orange02 ~]# lctl dk &amp;gt; /dev/&lt;span class=&quot;code-keyword&quot;&gt;null&lt;/span&gt;
[root@orange02 ~]# lctl dk /tmp/ceap-82.txt
Debug log: 5776 lines, 5776 kept, 0 dropped, 0 bad.
[root@orange02 ~]# lctl get_param lod.*.qos_threshold_rr
lod.orangefs-MDT0000-mdtlov.qos_threshold_rr=100%
[root@orange02 ~]# less /tmp/ceap-82.txt 
[root@orange02 ~]# logout
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;



&lt;p&gt;WITH Fix&lt;br/&gt;
=======&lt;/p&gt;
&lt;div class=&quot;code panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;codeContent panelContent&quot;&gt;
&lt;pre class=&quot;code-java&quot;&gt;Following 5 runs showed equal files distribution across all OSTs:
 [CEAP-82]$ &lt;span class=&quot;code-keyword&quot;&gt;for&lt;/span&gt; n in {1..5} ; &lt;span class=&quot;code-keyword&quot;&gt;do&lt;/span&gt; sh ceap-82.sh; done
IOR-2.10.3: MPI Coordinated Test of Parallel I/O

Run began: Mon Mar 23 02:27:28 2015
Command line used: /home/bloewe/benchmarks/IOR-2.10.3/src/C/IOR -a POSIX -B -C -E -F -e -g -k -b 200m -t 2m -o /mnt/lustre0/ceap-82/IOR_POSIX
Machine: Linux sjsc-321

Summary:
	api                = POSIX
	test filename      = /mnt/lustre0/ceap-82/IOR_POSIX
	access             = file-per-process
	ordering in a file = sequential offsets
	ordering inter file=constant task offsets = 1
	clients            = 288 (24 per node)
	repetitions        = 1
	xfersize           = 2 MiB
	blocksize          = 200 MiB
	aggregate filesize = 56.25 GiB

Operation  Max (MiB)  Min (MiB)  Mean (MiB)   Std Dev  Max (OPs)  Min (OPs)  Mean (OPs)   Std Dev  Mean (s)  
---------  ---------  ---------  ----------   -------  ---------  ---------  ----------   -------  --------
write        4310.41    4310.41     4310.41      0.00    2155.21    2155.21     2155.21      0.00  13.36299   EXCEL
read        10076.77   10076.77    10076.77      0.00    5038.38    5038.38     5038.38      0.00   5.71612   EXCEL

Max Write: 4310.41 MiB/sec (4519.80 MB/sec)
Max Read:  10076.77 MiB/sec (10566.26 MB/sec)

Run finished: Mon Mar 23 02:27:48 2015
     24 0
     24 1
     24 2
     24 3
     24 4
     24 5
     24 6
     24 7
     24 8
     24 9
     24 10
     24 11
IOR-2.10.3: MPI Coordinated Test of Parallel I/O

Run began: Mon Mar 23 02:27:50 2015
Command line used: /home/bloewe/benchmarks/IOR-2.10.3/src/C/IOR -a POSIX -B -C -E -F -e -g -k -b 200m -t 2m -o /mnt/lustre0/ceap-82/IOR_POSIX
Machine: Linux sjsc-321

Summary:
	api                = POSIX
	test filename      = /mnt/lustre0/ceap-82/IOR_POSIX
	access             = file-per-process
	ordering in a file = sequential offsets
	ordering inter file=constant task offsets = 1
	clients            = 288 (24 per node)
	repetitions        = 1
	xfersize           = 2 MiB
	blocksize          = 200 MiB
	aggregate filesize = 56.25 GiB

Operation  Max (MiB)  Min (MiB)  Mean (MiB)   Std Dev  Max (OPs)  Min (OPs)  Mean (OPs)   Std Dev  Mean (s)  
---------  ---------  ---------  ----------   -------  ---------  ---------  ----------   -------  --------
write        4269.62    4269.62     4269.62      0.00    2134.81    2134.81     2134.81      0.00  13.49066   EXCEL
read        10078.78   10078.78    10078.78      0.00    5039.39    5039.39     5039.39      0.00   5.71498   EXCEL

Max Write: 4269.62 MiB/sec (4477.02 MB/sec)
Max Read:  10078.78 MiB/sec (10568.37 MB/sec)

Run finished: Mon Mar 23 02:28:10 2015
     24 0
     24 1
     24 2
     24 3
     24 4
     24 5
     24 6
     24 7
     24 8
     24 9
     24 10
     24 11
IOR-2.10.3: MPI Coordinated Test of Parallel I/O

Run began: Mon Mar 23 02:28:11 2015
Command line used: /home/bloewe/benchmarks/IOR-2.10.3/src/C/IOR -a POSIX -B -C -E -F -e -g -k -b 200m -t 2m -o /mnt/lustre0/ceap-82/IOR_POSIX
Machine: Linux sjsc-321

Summary:
	api                = POSIX
	test filename      = /mnt/lustre0/ceap-82/IOR_POSIX
	access             = file-per-process
	ordering in a file = sequential offsets
	ordering inter file=constant task offsets = 1
	clients            = 288 (24 per node)
	repetitions        = 1
	xfersize           = 2 MiB
	blocksize          = 200 MiB
	aggregate filesize = 56.25 GiB

Operation  Max (MiB)  Min (MiB)  Mean (MiB)   Std Dev  Max (OPs)  Min (OPs)  Mean (OPs)   Std Dev  Mean (s)  
---------  ---------  ---------  ----------   -------  ---------  ---------  ----------   -------  --------
write        4343.86    4343.86     4343.86      0.00    2171.93    2171.93     2171.93      0.00  13.26011   EXCEL
read        10091.59   10091.59    10091.59      0.00    5045.80    5045.80     5045.80      0.00   5.70772   EXCEL

Max Write: 4343.86 MiB/sec (4554.86 MB/sec)
Max Read:  10091.59 MiB/sec (10581.80 MB/sec)

Run finished: Mon Mar 23 02:28:31 2015
     24 0
     24 1
     24 2
     24 3
     24 4
     24 5
     24 6
     24 7
     24 8
     24 9
     24 10
     24 11
IOR-2.10.3: MPI Coordinated Test of Parallel I/O

Run began: Mon Mar 23 02:28:33 2015
Command line used: /home/bloewe/benchmarks/IOR-2.10.3/src/C/IOR -a POSIX -B -C -E -F -e -g -k -b 200m -t 2m -o /mnt/lustre0/ceap-82/IOR_POSIX
Machine: Linux sjsc-321

Summary:
	api                = POSIX
	test filename      = /mnt/lustre0/ceap-82/IOR_POSIX
	access             = file-per-process
	ordering in a file = sequential offsets
	ordering inter file=constant task offsets = 1
	clients            = 288 (24 per node)
	repetitions        = 1
	xfersize           = 2 MiB
	blocksize          = 200 MiB
	aggregate filesize = 56.25 GiB

Operation  Max (MiB)  Min (MiB)  Mean (MiB)   Std Dev  Max (OPs)  Min (OPs)  Mean (OPs)   Std Dev  Mean (s)  
---------  ---------  ---------  ----------   -------  ---------  ---------  ----------   -------  --------
write        4169.55    4169.55     4169.55      0.00    2084.78    2084.78     2084.78      0.00  13.81443   EXCEL
read         9997.98    9997.98     9997.98      0.00    4998.99    4998.99     4998.99      0.00   5.76116   EXCEL

Max Write: 4169.55 MiB/sec (4372.09 MB/sec)
Max Read:  9997.98 MiB/sec (10483.64 MB/sec)

Run finished: Mon Mar 23 02:28:53 2015
     24 0
     24 1
     24 2
     24 3
     24 4
     24 5
     24 6
     24 7
     24 8
     24 9
     24 10
     24 11
IOR-2.10.3: MPI Coordinated Test of Parallel I/O

Run began: Mon Mar 23 02:28:56 2015
Command line used: /home/bloewe/benchmarks/IOR-2.10.3/src/C/IOR -a POSIX -B -C -E -F -e -g -k -b 200m -t 2m -o /mnt/lustre0/ceap-82/IOR_POSIX
Machine: Linux sjsc-321

Summary:
	api                = POSIX
	test filename      = /mnt/lustre0/ceap-82/IOR_POSIX
	access             = file-per-process
	ordering in a file = sequential offsets
	ordering inter file=constant task offsets = 1
	clients            = 288 (24 per node)
	repetitions        = 1
	xfersize           = 2 MiB
	blocksize          = 200 MiB
	aggregate filesize = 56.25 GiB

Operation  Max (MiB)  Min (MiB)  Mean (MiB)   Std Dev  Max (OPs)  Min (OPs)  Mean (OPs)   Std Dev  Mean (s)  
---------  ---------  ---------  ----------   -------  ---------  ---------  ----------   -------  --------
write        4427.20    4427.20     4427.20      0.00    2213.60    2213.60     2213.60      0.00  13.01049   EXCEL
read        10026.62   10026.62    10026.62      0.00    5013.31    5013.31     5013.31      0.00   5.74471   EXCEL

Max Write: 4427.20 MiB/sec (4642.25 MB/sec)
Max Read:  10026.62 MiB/sec (10513.67 MB/sec)

Run finished: Mon Mar 23 02:29:15 2015
     24 0
     24 1
     24 2
     24 3
     24 4
     24 5
     24 6
     24 7
     24 8
     24 9
     24 10
     24 11
[CEAP-82]$ 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="120727" author="gerrit" created="Wed, 8 Jul 2015 17:19:50 +0000"  >&lt;p&gt;Oleg Drokin (oleg.drokin@intel.com) merged in patch &lt;a href=&quot;http://review.whamcloud.com/14636/&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/14636/&lt;/a&gt;&lt;br/&gt;
Subject: &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-977&quot; title=&quot;incorrect round robin object allocation &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-977&quot;&gt;&lt;del&gt;LU-977&lt;/del&gt;&lt;/a&gt; lod: Patch to protect lqr_start_idx&lt;br/&gt;
Project: fs/lustre-release&lt;br/&gt;
Branch: master&lt;br/&gt;
Current Patch Set: &lt;br/&gt;
Commit: d9b4bc5476c779aaaee6797e5e148b5e0b771980&lt;/p&gt;</comment>
                            <comment id="124835" author="pjones" created="Fri, 21 Aug 2015 22:10:36 +0000"  >&lt;p&gt;Landed for 2.8&lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="47324">LU-9780</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="62543">LU-14377</issuekey>
        </issuelink>
            <issuelink>
            <issuekey id="10082">LU-9</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                                        </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                    <customfield id="customfield_10020" key="com.atlassian.jira.plugin.system.customfieldtypes:float">
                        <customfieldname>Bugzilla ID</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>24194.0</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvlnb:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>7276</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>