<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:27:45 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
<language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2735] sanity.sh test_151: NOT IN CACHE: before: 337, after: 337 </title>
                <link>https://jira.whamcloud.com/browse/LU-2735</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;This issue was created by maloo for Li Wei &amp;lt;liwei@whamcloud.com&amp;gt;&lt;/p&gt;

&lt;p&gt;This issue relates to the following test suite run: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/b8f32682-6c5f-11e2-91d6-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/b8f32682-6c5f-11e2-91d6-52540035b04c&lt;/a&gt;.&lt;/p&gt;

&lt;p&gt;The sub-test test_151 failed with the following error:&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;NOT IN CACHE: before: 337, after: 337&lt;/p&gt;&lt;/blockquote&gt;

&lt;p&gt;Info required for matching: sanity 151&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;== sanity test 151: test cache on oss and controls ================================= 21:28:47 (1359696527)
CMD: client-21-ib /usr/sbin/lctl get_param -n obdfilter.lustre-OST*.read_cache_enable 		osd-*.lustre-OST*.read_cache_enable 2&amp;gt;&amp;amp;1
CMD: client-21-ib /usr/sbin/lctl get_param -n obdfilter.lustre-OST*.read_cache_enable 		osd-*.lustre-OST*.read_cache_enable 2&amp;gt;&amp;amp;1
CMD: client-21-ib /usr/sbin/lctl set_param -n obdfilter.lustre-OST*.writethrough_cache_enable=1 		osd-*.lustre-OST*.writethrough_cache_enable=1 2&amp;gt;&amp;amp;1
3+0 records in
3+0 records out
12288 bytes (12 kB) copied, 0.00445821 s, 2.8 MB/s
CMD: client-21-ib /usr/sbin/lctl get_param -n obdfilter.lustre-OST*.stats 		osd-*.lustre-OST*.stats 2&amp;gt;&amp;amp;1
CMD: client-21-ib /usr/sbin/lctl get_param -n obdfilter.lustre-OST*.stats 		osd-*.lustre-OST*.stats 2&amp;gt;&amp;amp;1
 sanity test_151: @@@@@@ FAIL: NOT IN CACHE: before: 337, after: 337 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</description>
                <environment></environment>
        <key id="17404">LU-2735</key>
            <summary>sanity.sh test_151: NOT IN CACHE: before: 337, after: 337 </summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="4" iconUrl="https://jira.whamcloud.com/images/icons/priorities/minor.svg">Minor</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="keith">Keith Mannthey</assignee>
                                    <reporter username="maloo">Maloo</reporter>
                        <labels>
                    </labels>
                <created>Fri, 1 Feb 2013 09:25:59 +0000</created>
                <updated>Fri, 21 Jun 2013 23:43:03 +0000</updated>
                            <resolved>Fri, 21 Jun 2013 23:43:03 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>10</watches>
                                                                            <comments>
                            <comment id="52625" author="hongchao.zhang" created="Mon, 18 Feb 2013 09:40:26 +0000"  >&lt;p&gt;in the debug logs in OST, the read requests are indeed sent to the OST from client, then the possible reason of this issue could be the pages just dropped&lt;br/&gt;
between the write and read operation, will create a debug patch to test it.&lt;/p&gt;</comment>
                            <comment id="52738" author="hongchao.zhang" created="Wed, 20 Feb 2013 05:43:26 +0000"  >&lt;p&gt;the patch is tracked at &lt;a href=&quot;http://review.whamcloud.com/#change,5475&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/#change,5475&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="52780" author="yong.fan" created="Wed, 20 Feb 2013 21:42:05 +0000"  >&lt;p&gt;Another failure instance:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/91aa7cb6-7ad7-11e2-b916-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/91aa7cb6-7ad7-11e2-b916-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="53495" author="keith" created="Wed, 6 Mar 2013 22:45:56 +0000"  >&lt;p&gt;Another instance:  It reported 5 out of the last 100 had failed. &lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/1a87288c-85f4-11e2-9f8d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/1a87288c-85f4-11e2-9f8d-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="53645" author="yujian" created="Sat, 9 Mar 2013 08:09:36 +0000"  >&lt;p&gt;Another one: &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/ec2cde8e-8885-11e2-b643-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/ec2cde8e-8885-11e2-b643-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="54028" author="jlevi" created="Thu, 14 Mar 2013 12:56:01 +0000"  >&lt;p&gt;Patch landed to master and confirmed with Yu Jian, this ticket can be closed.&lt;/p&gt;</comment>
                            <comment id="54029" author="jlevi" created="Thu, 14 Mar 2013 12:56:19 +0000"  >&lt;p&gt;Patch landed, and confirmed with Yu Jian, this ticket can be closed.&lt;/p&gt;</comment>
                            <comment id="54041" author="keith" created="Thu, 14 Mar 2013 14:36:44 +0000"  >&lt;p&gt;Sorry to have to reopen this issue but I think we need to be really clear about what the situation is.  &lt;/p&gt;

&lt;p&gt;The patch creates special Lustre behavior (accessed by setting a fail_loc value) to conform Lustre to the desired test behavior.&lt;/p&gt;

&lt;p&gt;I don&apos;t expect fail_loc to be used outside of test. &lt;/p&gt;

&lt;p&gt;I argue this is not a fix and that either the test needs to be changed with the understanding that cache behavior can be non-deterministic or the cache needs to conform to the behavior expected under all conditions.&lt;/p&gt;

&lt;p&gt;This patch is a great debug point but I don&apos;t think it is a solution to the core issue.  I happen to be working on &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2902&quot; title=&quot;sanity test_156: NOT IN CACHE: before: , after: &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2902&quot;&gt;&lt;del&gt;LU-2902&lt;/del&gt;&lt;/a&gt; that is basically this issue.  &lt;/p&gt;

&lt;p&gt;Also perhaps this is not a blocking issue and I will not argue if this is closed again. &lt;/p&gt;</comment>
                            <comment id="54086" author="green" created="Fri, 15 Mar 2013 02:20:37 +0000"  >&lt;p&gt;well, I agree it&apos;s sort of a hack to disable cache pruging for the test.&lt;br/&gt;
But what other options do we have? not much.&lt;/p&gt;

&lt;p&gt;Another one I can think of is we can somehow ensure there&apos;s a lot of free ram, so no mem pressure. (e.g. tail /dev/zero or other such memhog that then terminates and leaves a lot of free ram behind).&lt;/p&gt;</comment>
                            <comment id="54126" author="keith" created="Fri, 15 Mar 2013 14:18:42 +0000"  >&lt;p&gt;I agree this is a touchy problem but it is good to test this behavior when conditions are correct. &lt;/p&gt;

&lt;p&gt;A memhog allocation might oom the box if we are under enough memory pressure to evict the last small write from the page cache in short order. &lt;/p&gt;

&lt;p&gt;I am going to propose a patch that attempts to detect when we are dropped from the cache under memory pressure. It can&apos;t be a 100% thing (without a hard cache_drop proc stat) but it could be a guide as to when to skip the test. &lt;/p&gt;

</comment>
                            <comment id="54154" author="pjones" created="Fri, 15 Mar 2013 19:49:24 +0000"  >&lt;p&gt;ok then I am dropping the priority and reassigning to Keith for the extra work he is proposing. Alternatively this could have been covered under a new enhancement ticket.&lt;/p&gt;</comment>
                            <comment id="54236" author="adilger" created="Mon, 18 Mar 2013 07:37:43 +0000"  >&lt;p&gt;This seems closely related, if not a duplicate of &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2848&quot; title=&quot;Failure on test suite sanity test_151: NOT IN CACHE before: , after: &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2848&quot;&gt;&lt;del&gt;LU-2848&lt;/del&gt;&lt;/a&gt;?  Can this be closed as a duplicate, or are they for separate issues?&lt;/p&gt;</comment>
                            <comment id="54237" author="adilger" created="Mon, 18 Mar 2013 07:38:31 +0000"  >&lt;p&gt;The reason I ask is because this test is still failing several times a week.&lt;/p&gt;</comment>
                            <comment id="54268" author="keith" created="Mon, 18 Mar 2013 16:46:30 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2848&quot; title=&quot;Failure on test suite sanity test_151: NOT IN CACHE before: , after: &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2848&quot;&gt;&lt;del&gt;LU-2848&lt;/del&gt;&lt;/a&gt; is  the &quot; NOT IN CACHE before: , after: &quot; behavior.  I don&apos;t think that that issue is cased by items being dropped from the cache (that is what the landed fail_loc patch protects against).  In my testing for &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2902&quot; title=&quot;sanity test_156: NOT IN CACHE: before: , after: &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2902&quot;&gt;&lt;del&gt;LU-2902&lt;/del&gt;&lt;/a&gt; I was unable to reproduce the issue outside of the OOM killer killing ssh between the nodes.   There is still another issue out there in the roc_hit family of tests.  I have submitted a patch to see what roc_hit is getting from /proc (as part of lu-2902), in order to see what it sees. &lt;/p&gt;</comment>
                            <comment id="55507" author="keith" created="Thu, 4 Apr 2013 17:08:40 +0000"  >&lt;p&gt;ok new update:&lt;/p&gt;

&lt;p&gt;&lt;a href=&quot;https://maloo.whamcloud.com/test_sets/c11cebd6-9cde-11e2-8b27-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/c11cebd6-9cde-11e2-8b27-52540035b04c&lt;/a&gt;&lt;br/&gt;
test_151 	&lt;/p&gt;

&lt;p&gt;    Error: &apos;NOT IN CACHE: before: 349, after: 349&apos;&lt;br/&gt;
    Failure Rate: 7.00% of last 100 executions &lt;span class=&quot;error&quot;&gt;&amp;#91;all branches&amp;#93;&lt;/span&gt; &lt;/p&gt;

&lt;p&gt;There are still null errors out there. &lt;/p&gt;

&lt;p&gt;This should be with the fail_loc fix in place. &lt;br/&gt;
From the test log:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;fail_loc=0x609
3+0 records in
3+0 records out
12288 bytes (12 kB) copied, 0.00473753 s, 2.6 MB/s
CMD: client-21-ib /usr/sbin/lctl get_param -n obdfilter.lustre-OST*.stats 		osd-*.lustre-OST*.stats 2&amp;gt;&amp;amp;1
snapshot_time 1365007767.342119 secs.usecs read_bytes 4 samples [bytes] 4096 1048576 1380352 write_bytes 3 samples [bytes] 327680 1048576 1900544 get_info 81 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 35 samples [reqs] create 2 samples [reqs] destroy 76 samples [reqs] punch 1 samples [reqs] sync 3 samples [reqs] preprw 7 samples [reqs] commitrw 7 samples [reqs] ping 10 samples [reqs] snapshot_time 1365007767.342180 secs.usecs write_bytes 1 samples [bytes] 50400 50400 50400 get_info 76 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 34 samples [reqs] create 2 samples [reqs] destroy 73 samples [reqs] sync 1 samples [reqs] preprw 1 samples [reqs] commitrw 1 samples [reqs] ping 11 samples [reqs] snapshot_time 1365007767.342212 secs.usecs read_bytes 7 samples [bytes] 4096 8192 49152 write_bytes 3 samples [bytes] 1910 6096 9922 get_info 85 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 34 samples [reqs] create 2 samples [reqs] destroy 73 samples [reqs] punch 1 samples [reqs] sync 4 samples [reqs] preprw 10 samples [reqs] commitrw 10 samples [reqs] ping 12 samples [reqs] snapshot_time 1365007767.342242 secs.usecs get_info 73 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 34 samples [reqs] create 2 samples [reqs] destroy 73 samples [reqs] ping 12 samples [reqs] snapshot_time 1365007767.342268 secs.usecs get_info 73 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 34 samples [reqs] create 2 samples [reqs] destroy 73 samples [reqs] ping 17 samples [reqs] snapshot_time 1365007767.342293 secs.usecs get_info 73 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 34 samples [reqs] create 2 samples [reqs] destroy 73 samples [reqs] ping 12 samples [reqs] snapshot_time 1365007767.342324 secs.usecs get_info 72 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 34 samples [reqs] create 2 samples 
[reqs] destroy 73 samples [reqs] ping 12 samples [reqs] snapshot_time 1365007767.342552 secs.usecs get_page 7 samples [usec] 1 11 24 162 cache_access 337 samples [pages] 1 1 337 cache_hit 337 samples [pages] 1 1 337 snapshot_time 1365007767.342582 secs.usecs get_page 1 samples [usec] 1 1 1 1 snapshot_time 1365007767.342599 secs.usecs get_page 10 samples [usec] 0 1 3 3 cache_access 12 samples [pages] 1 1 12 cache_hit 12 samples [pages] 1 1 12
CMD: client-21-ib /usr/sbin/lctl get_param -n obdfilter.lustre-OST*.stats 		osd-*.lustre-OST*.stats 2&amp;gt;&amp;amp;1
CMD: client-21-ib /usr/sbin/lctl get_param -n obdfilter.lustre-OST*.stats 		osd-*.lustre-OST*.stats 2&amp;gt;&amp;amp;1
snapshot_time 1365007768.89231 secs.usecs read_bytes 4 samples [bytes] 4096 1048576 1380352 write_bytes 3 samples [bytes] 327680 1048576 1900544 get_info 81 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 36 samples [reqs] create 2 samples [reqs] destroy 76 samples [reqs] punch 1 samples [reqs] sync 3 samples [reqs] preprw 7 samples [reqs] commitrw 7 samples [reqs] ping 10 samples [reqs] snapshot_time 1365007768.89292 secs.usecs write_bytes 1 samples [bytes] 50400 50400 50400 get_info 76 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 35 samples [reqs] create 2 samples [reqs] destroy 73 samples [reqs] sync 1 samples [reqs] preprw 1 samples [reqs] commitrw 1 samples [reqs] ping 11 samples [reqs] snapshot_time 1365007768.89323 secs.usecs read_bytes 7 samples [bytes] 4096 8192 49152 write_bytes 3 samples [bytes] 1910 6096 9922 get_info 86 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 35 samples [reqs] create 2 samples [reqs] destroy 73 samples [reqs] punch 1 samples [reqs] sync 4 samples [reqs] preprw 10 samples [reqs] commitrw 10 samples [reqs] ping 12 samples [reqs] snapshot_time 1365007768.89353 secs.usecs read_bytes 2 samples [bytes] 4096 12288 16384 write_bytes 1 samples [bytes] 12288 12288 12288 get_info 74 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 35 samples [reqs] create 2 samples [reqs] destroy 73 samples [reqs] sync 1 samples [reqs] preprw 3 samples [reqs] commitrw 3 samples [reqs] ping 12 samples [reqs] snapshot_time 1365007768.89382 secs.usecs get_info 73 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 34 samples [reqs] create 2 samples [reqs] destroy 73 samples [reqs] ping 17 samples [reqs] snapshot_time 1365007768.89413 secs.usecs get_info 73 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 34 samples [reqs] create 2 samples [reqs] destroy 73 samples [reqs] ping 12 samples [reqs] 
snapshot_time 1365007768.89447 secs.usecs get_info 72 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 34 samples [reqs] create 2 samples [reqs] destroy 73 samples [reqs] ping 12 samples [reqs] snapshot_time 1365007768.89653 secs.usecs get_page 7 samples [usec] 1 11 24 162 cache_access 337 samples [pages] 1 1 337 cache_hit 337 samples [pages] 1 1 337 snapshot_time 1365007768.89681 secs.usecs get_page 1 samples [usec] 1 1 1 1 snapshot_time 1365007768.89697 secs.usecs get_page 10 samples [usec] 0 1 3 3 cache_access 12 samples [pages] 1 1 12 cache_hit 12 samples [pages] 1 1 12
CMD: client-21-ib /usr/sbin/lctl get_param -n obdfilter.lustre-OST*.stats 		osd-*.lustre-OST*.stats 2&amp;gt;&amp;amp;1
CMD: client-21-ib /usr/sbin/lctl set_param fail_loc=0
fail_loc=0
 sanity test_151: @@@@@@ FAIL: NOT IN CACHE: before: 349, after: 349 
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;It seems we are back to square one. &lt;/p&gt;</comment>
                            <comment id="56113" author="green" created="Thu, 11 Apr 2013 17:04:50 +0000"  >&lt;p&gt;happened again &lt;a href=&quot;https://maloo.whamcloud.com/test_sets/f0607d16-a285-11e2-81ba-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_sets/f0607d16-a285-11e2-81ba-52540035b04c&lt;/a&gt;&lt;/p&gt;</comment>
                            <comment id="56275" author="adilger" created="Mon, 15 Apr 2013 00:03:33 +0000"  >&lt;p&gt;Not sure if this is a spurious observation or not, but it appears that most of these recent failures are on client-23-ib (i.e. running IB networking) instead of TCP networking.  At least it appears that there was a decrease in the number of this style of failure when running on IB.&lt;/p&gt;</comment>
                            <comment id="56276" author="adilger" created="Mon, 15 Apr 2013 00:04:26 +0000"  >&lt;p&gt;Not sure if this is a spurious observation or not, but it appears that most of these recent failures are on client-23-ib (i.e. running IB networking) instead of TCP networking.  At least it appears that there was a decrease in the number of this style of failure when running on IB.&lt;/p&gt;</comment>
                            <comment id="56277" author="adilger" created="Mon, 15 Apr 2013 00:07:52 +0000"  >&lt;p&gt;There are no more &lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2735&quot; title=&quot;sanity.sh test_151: NOT IN CACHE: before: 337, after: 337 &quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2735&quot;&gt;&lt;del&gt;LU-2735&lt;/del&gt;&lt;/a&gt; failures reported for TCP since 2013-03-31 (&lt;a href=&quot;https://maloo.whamcloud.com/sub_tests/4b048dda-9bd2-11e2-be58-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/sub_tests/4b048dda-9bd2-11e2-be58-52540035b04c&lt;/a&gt;), but there have been 7 failures on client-23-ib since that time.&lt;/p&gt;</comment>
                            <comment id="56278" author="adilger" created="Mon, 15 Apr 2013 00:22:08 +0000"  >&lt;p&gt;To compare these numbers, IB has had 7/11 runs fail since 2013-03-31, while TCP has had about 350 passes without hitting this specific failure mode (for &quot;review&quot;, though &quot;review-dne&quot; is failing for potentially another reason).&lt;/p&gt;

&lt;p&gt;This might just relate to a race condition based on the speed of the networking, or it might relate to the fact that IB testing is run on separate physical nodes, while TCP testing is run on VMs on a single node.&lt;/p&gt;</comment>
                            <comment id="56339" author="keith" created="Mon, 15 Apr 2013 19:15:04 +0000"  >&lt;p&gt;It is known that the NOT IN CACHE issues can be caused by memory pressure.  The IB systems could be under more memory pressure. Thanks for spotting the IB issue; I have only really been tracking the &quot;review&quot; queue with &quot;master&quot; builds.&lt;/p&gt;</comment>
                            <comment id="57057" author="keith" created="Thu, 25 Apr 2013 17:02:35 +0000"  >&lt;p&gt;From a client ib test failure a &quot;before&quot; debug output : &lt;a href=&quot;https://maloo.whamcloud.com/test_logs/c0ac67cc-ad68-11e2-b72d-52540035b04c&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;https://maloo.whamcloud.com/test_logs/c0ac67cc-ad68-11e2-b72d-52540035b04c&lt;/a&gt;&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;snapshot_time 1366819124.688743 secs.usecs read_bytes 6 samples [bytes] 4096 1048576 1396736 write_bytes 6 samples [bytes] 777 1048576 1909333 get_info 87 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 59 samples [reqs] create 2 samples [reqs] destroy 77 samples [reqs] setattr 2 samples [reqs] punch 4 samples [reqs] sync 6 samples [reqs] preprw 12 samples [reqs] commitrw 12 samples [reqs] ping 41 samples [reqs]

snapshot_time 1366819124.688804 secs.usecs read_bytes 130 samples [bytes] 8192 1048576 134234112 write_bytes 130 samples [bytes] 1916 1048576 134225740 get_info 96 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 58 samples [reqs] create 2 samples [reqs] destroy 79 samples [reqs] setattr 1 samples [reqs] punch 5 samples [reqs] sync 6 samples [reqs] preprw 260 samples [reqs] commitrw 260 samples [reqs] ping 38 samples [reqs]

snapshot_time 1366819124.688838 secs.usecs write_bytes 1 samples [bytes] 50400 50400 50400 get_info 76 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 58 samples [reqs] create 2 samples [reqs] destroy 73 samples [reqs] setattr 1 samples [reqs] sync 1 samples [reqs] preprw 1 samples [reqs] commitrw 1 samples [reqs] ping 44 samples [reqs]

snapshot_time 1366819124.688867 secs.usecs read_bytes 7 samples [bytes] 4096 8192 49152 write_bytes 3 samples [bytes] 1910 6096 9922 get_info 85 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 58 samples [reqs] create 2 samples [reqs] destroy 73 samples [reqs] setattr 1 samples [reqs] punch 1 samples [reqs] sync 4 samples [reqs] preprw 10 samples [reqs] commitrw 10 samples [reqs] ping 44 samples [reqs]

snapshot_time 1366819124.688919 secs.usecs read_bytes 6 samples [bytes] 4096 12288 49152 write_bytes 2 samples [bytes] 3019 12288 15307 get_info 78 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 58 samples [reqs] create 2 samples [reqs] destroy 75 samples [reqs] punch 1 samples [reqs] sync 2 samples [reqs] preprw 8 samples [reqs] commitrw 8 samples [reqs] ping 42 samples [reqs]

snapshot_time 1366819124.688957 secs.usecs read_bytes 3 samples [bytes] 8192 8192 24576 write_bytes 2 samples [bytes] 6096 6096 12192 get_info 83 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 58 samples [reqs] create 2 samples [reqs] destroy 74 samples [reqs] punch 1 samples [reqs] sync 3 samples [reqs] preprw 5 samples [reqs] commitrw 5 samples [reqs] ping 50 samples [reqs]

snapshot_time 1366819124.688991 secs.usecs read_bytes 2 samples [bytes] 8192 8192 16384 write_bytes 3 samples [bytes] 1916 6096 12108 get_info 81 samples [reqs] connect 1 samples [reqs] disconnect 1 samples [reqs] statfs 58 samples [reqs] create 2 samples [reqs] destroy 75 samples [reqs] punch 1 samples [reqs] sync 4 samples [reqs] preprw 5 samples [reqs] commitrw 5 samples [reqs] ping 49 samples [reqs]

snapshot_time 1366819124.689203 secs.usecs get_page 12 samples [usec] 0 10 37 217 cache_access 341 samples [pages] 1 1 341 cache_hit 339 samples [pages] 1 1 339 cache_miss 2 samples [pages] 1 1 2

snapshot_time 1366819124.689231 secs.usecs get_page 260 samples [usec] 1 234 10228 728602 cache_access 32772 samples [pages] 1 1 32772 cache_hit 16384 samples [pages] 1 1 16384 cache_miss 16388 samples [pages] 1 1 16388

snapshot_time 1366819124.689262 secs.usecs get_page 10 samples [usec] 0 1 4 4 cache_access 12 samples [pages] 1 1 12 cache_hit 12 samples [pages] 1 1 12

snapshot_time 1366819124.689280 secs.usecs get_page 8 samples [usec] 0 3 6 14 cache_access 9 samples [pages] 1 1 9 cache_hit 6 samples [pages] 1 1 6 cache_miss 3 samples [pages] 1 1 3

snapshot_time 1366819124.689311 secs.usecs get_page 5 samples [usec] 0 5 11 43 cache_access 4 samples [pages] 1 1 4 cache_hit 2 samples [pages] 1 1 2 cache_miss 2 samples [pages] 1 1 2
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;What is important to note is that there are only 5 proc entries about cache_access.  2 of the ost /proc stats are missing. Also when compared to the &quot;After&quot; results there is no ost that claims the cache_access. This may be just a different form of the no values read from /proc.&lt;/p&gt;

&lt;p&gt;I will pursue further debug work in lu-2902.  The ib clients do seem to help trigger the issue. &lt;/p&gt;</comment>
                            <comment id="59677" author="keith" created="Thu, 30 May 2013 18:29:22 +0000"  >&lt;p&gt;&lt;a href=&quot;https://jira.whamcloud.com/browse/LU-2979&quot; title=&quot;sanity 133a: proc counter for mkdir on mds1 was not incremented&quot; class=&quot;issue-link&quot; data-issue-key=&quot;LU-2979&quot;&gt;&lt;del&gt;LU-2979&lt;/del&gt;&lt;/a&gt; appears to be the root cause of all of these issues.  Since the lu-2970 patch landed we have seen no errors. &lt;/p&gt;

&lt;p&gt;Perhaps &lt;a href=&quot;http://review.whamcloud.com/5475&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/5475&lt;/a&gt; should be reverted?&lt;/p&gt;
</comment>
                            <comment id="59698" author="adilger" created="Thu, 30 May 2013 21:24:19 +0000"  >&lt;p&gt;It looks like 5475 is just adding a test case, and it is never a bad idea to keep the test case after the bug has been fixed.&lt;/p&gt;</comment>
                            <comment id="59709" author="keith" created="Thu, 30 May 2013 23:13:59 +0000"  >&lt;p&gt;I just wanted to be sure we are ok with OBD_FAIL_OBD_NO_LRU and the fail_loc mode change to pass the test. &lt;/p&gt;</comment>
                            <comment id="61059" author="keith" created="Fri, 21 Jun 2013 23:43:03 +0000"  >&lt;p&gt;There has been no sign of this issue in over a month.  The proc changes fixed any remaining errors. &lt;/p&gt;</comment>
                    </comments>
                <issuelinks>
                            <issuelinktype id="10011">
                    <name>Related</name>
                                            <outwardlinks description="is related to ">
                                        <issuelink>
            <issuekey id="17655">LU-2848</issuekey>
        </issuelink>
                            </outwardlinks>
                                                                <inwardlinks description="is related to">
                                        <issuelink>
            <issuekey id="17754">LU-2902</issuekey>
        </issuelink>
                            </inwardlinks>
                                    </issuelinktype>
                    </issuelinks>
                <attachments>
                    </attachments>
                <subtasks>
                    </subtasks>
                <customfields>
                                                                                                                                                                                            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                        <customfieldname>Development</customfieldname>
                        <customfieldvalues>
                            
                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                        <customfieldname>Rank</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>1|hzvig7:</customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                        <customfieldname>Rank (Obsolete)</customfieldname>
                        <customfieldvalues>
                            <customfieldvalue>6639</customfieldvalue>
                        </customfieldvalues>
                    </customfield>
                                                                                            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                        <customfieldname>Severity</customfieldname>
                        <customfieldvalues>
                                <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>

                        </customfieldvalues>
                    </customfield>
                                                                                                                                                                                                                                                                                                                                                        </customfields>
    </item>
</channel>
</rss>