<!-- 
RSS generated by JIRA (9.4.14#940014-sha1:734e6822bbf0d45eff9af51f82432957f73aa32c) at Sat Feb 10 01:22:16 UTC 2024

It is possible to restrict the fields that are returned in this document by specifying the 'field' parameter in your request.
For example, to request only the issue key and summary append 'field=key&field=summary' to the URL of your request.
-->
<rss version="0.92">
<channel>
    <title>Whamcloud Community JIRA</title>
    <link>https://jira.whamcloud.com</link>
    <description>This file is an XML representation of an issue</description>
    <language>en-us</language>
    <build-info>
        <version>9.4.14</version>
        <build-number>940014</build-number>
        <build-date>05-12-2023</build-date>
    </build-info>


<item>
            <title>[LU-2089] BUG: Bad page state in process ll_ost_io01_039  pfn:63ae41</title>
                <link>https://jira.whamcloud.com/browse/LU-2089</link>
                <project id="10000" key="LU">Lustre</project>
                    <description>&lt;p&gt;I see many messages on the console of our OSTs running 2.3.51-3chaos:&lt;/p&gt;
&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;2012-10-04 03:49:04 BUG: Bad page state in process ll_ost_io01_039  pfn:63ae41
2012-10-04 03:49:04 page:ffffea0015ce1e38 flags:0040000000000080 count:0 mapcount:0 mapping:(null) index:0 (Tainted: P    B      ----------------  )
2012-10-04 03:49:04 Pid: 7115, comm: ll_ost_io01_039 Tainted: P    B      ----------------   2.6.32-220.23.1.1chaos.ch5.x86_64 #1
2012-10-04 03:49:04 Call Trace:
2012-10-04 03:49:04  [&amp;lt;ffffffff81121507&amp;gt;] ? bad_page+0x107/0x160
2012-10-04 03:49:04  [&amp;lt;ffffffff81124599&amp;gt;] ? free_hot_cold_page+0x1c9/0x220
2012-10-04 03:49:04  [&amp;lt;ffffffff811246af&amp;gt;] ? free_hot_page+0x2f/0x60
2012-10-04 03:49:04  [&amp;lt;ffffffff811275de&amp;gt;] ? __put_single_page+0x1e/0x30
2012-10-04 03:49:04  [&amp;lt;ffffffff81127755&amp;gt;] ? put_page+0x25/0x40
2012-10-04 03:49:04  [&amp;lt;ffffffffa086ff38&amp;gt;] ? ptlrpc_free_bulk+0x98/0x330 [ptlrpc]
2012-10-04 03:49:04  [&amp;lt;ffffffffa0d68e01&amp;gt;] ? ost_brw_write+0x501/0x15e0 [ost]
2012-10-04 03:49:04  [&amp;lt;ffffffffa08443c0&amp;gt;] ? target_bulk_timeout+0x0/0xc0 [ptlrpc]
2012-10-04 03:49:04  [&amp;lt;ffffffffa0d6f4d2&amp;gt;] ? ost_handle+0x32e2/0x4690 [ost]
2012-10-04 03:49:04  [&amp;lt;ffffffffa088b39b&amp;gt;] ? ptlrpc_update_export_timer+0x4b/0x470 [ptlrpc]
2012-10-04 03:49:04  [&amp;lt;ffffffffa08937fc&amp;gt;] ? ptlrpc_server_handle_request+0x41c/0xe00 [ptlrpc]
2012-10-04 03:49:04  [&amp;lt;ffffffffa03306be&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
2012-10-04 03:49:04  [&amp;lt;ffffffffa034213f&amp;gt;] ? lc_watchdog_touch+0x6f/0x180 [libcfs]
2012-10-04 03:49:04  [&amp;lt;ffffffffa088abb7&amp;gt;] ? ptlrpc_wait_event+0xa7/0x2a0 [ptlrpc]
2012-10-04 03:49:04  [&amp;lt;ffffffff81051ba3&amp;gt;] ? __wake_up+0x53/0x70
2012-10-04 03:49:04  [&amp;lt;ffffffffa0894dd1&amp;gt;] ? ptlrpc_main+0xbf1/0x19e0 [ptlrpc]
2012-10-04 03:49:04  [&amp;lt;ffffffffa08941e0&amp;gt;] ? ptlrpc_main+0x0/0x19e0 [ptlrpc]
2012-10-04 03:49:04  [&amp;lt;ffffffff8100c14a&amp;gt;] ? child_rip+0xa/0x20
2012-10-04 03:49:04  [&amp;lt;ffffffffa08941e0&amp;gt;] ? ptlrpc_main+0x0/0x19e0 [ptlrpc]
2012-10-04 03:49:04  [&amp;lt;ffffffffa08941e0&amp;gt;] ? ptlrpc_main+0x0/0x19e0 [ptlrpc]
2012-10-04 03:49:04  [&amp;lt;ffffffff8100c140&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;

&lt;p&gt;These messages drown out everything else on the console, making it hard to pick out other errors.&lt;/p&gt;

&lt;p&gt;It looks very similar to ORI-783, although that bug was triggered by setting &apos;sync_journal=1&apos;, which we are not doing here.&lt;/p&gt;</description>
                <environment>Lustre: 2.3.51-3chaos</environment>
        <key id="16247">LU-2089</key>
            <summary>BUG: Bad page state in process ll_ost_io01_039  pfn:63ae41</summary>
                <type id="1" iconUrl="https://jira.whamcloud.com/secure/viewavatar?size=xsmall&amp;avatarId=11303&amp;avatarType=issuetype">Bug</type>
                                            <priority id="2" iconUrl="https://jira.whamcloud.com/images/icons/priorities/critical.svg">Critical</priority>
                        <status id="5" iconUrl="https://jira.whamcloud.com/images/icons/statuses/resolved.png" description="A resolution has been taken, and it is awaiting verification by reporter. From here issues are either reopened, or are closed.">Resolved</status>
                    <statusCategory id="3" key="done" colorName="success"/>
                                    <resolution id="1">Fixed</resolution>
                                        <assignee username="bzzz">Alex Zhuravlev</assignee>
                                    <reporter username="prakash">Prakash Surya</reporter>
                        <labels>
                            <label>topsequoia</label>
                    </labels>
                <created>Thu, 4 Oct 2012 12:32:56 +0000</created>
                <updated>Fri, 19 Apr 2013 14:55:48 +0000</updated>
                            <resolved>Mon, 8 Oct 2012 01:19:04 +0000</resolved>
                                    <version>Lustre 2.4.0</version>
                                    <fixVersion>Lustre 2.4.0</fixVersion>
                                        <due></due>
                            <votes>0</votes>
                                    <watches>8</watches>
                                                                            <comments>
                            <comment id="45995" author="bzzz" created="Thu, 4 Oct 2012 12:38:28 +0000"  >&lt;p&gt;thanks for the report, looking at this.&lt;/p&gt;</comment>
                            <comment id="45996" author="prakash" created="Thu, 4 Oct 2012 12:39:46 +0000"  >&lt;p&gt;And just in case it&apos;s relevant, these messages were printed just prior to the constant stream of &quot;Bad page state&quot; messages:&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
2012-10-03 17:24:57 Lustre: Lustre: Build Version: 2.3.51-3chaos-3chaos--PRISTINE-2.6.32-220.23.1.1chaos.ch5.x86_64
2012-10-03 17:25:21 LustreError: 137-5: UUID &apos;ls1-OST0182_UUID&apos; is not available for connect (no target)
2012-10-03 17:25:21 LustreError: 137-5: UUID &apos;ls1-OST0181_UUID&apos; is not available for connect (no target)
2012-10-03 17:25:44 LustreError: 137-5: UUID &apos;lstest-OST0181_UUID&apos; is not available for connect (no target)
2012-10-03 17:25:46 LustreError: 137-5: UUID &apos;lstest-OST0181_UUID&apos; is not available for connect (no target)
2012-10-03 17:25:46 LustreError: Skipped 3 previous similar messages
2012-10-03 17:25:48 LustreError: 137-5: UUID &apos;lstest-OST0181_UUID&apos; is not available for connect (no target)
2012-10-03 17:25:48 LustreError: Skipped 48 previous similar messages
2012-10-03 17:25:52 LustreError: 137-5: UUID &apos;lstest-OST0181_UUID&apos; is not available for connect (no target)
2012-10-03 17:25:52 LustreError: Skipped 48 previous similar messages
2012-10-03 17:26:00 LustreError: 137-5: UUID &apos;lstest-OST0182_UUID&apos; is not available for connect (no target)
2012-10-03 17:26:00 LustreError: Skipped 24 previous similar messages
2012-10-03 17:26:16 LustreError: 137-5: UUID &apos;lstest-OST0182_UUID&apos; is not available for connect (no target)
2012-10-03 17:26:16 LustreError: Skipped 88 previous similar messages
2012-10-03 17:26:39 LustreError: 6396:0:(mgc_request.c:246:do_config_log_add()) failed processing sptlrpc log: -2
2012-10-03 17:26:39 LustreError: 6517:0:(fsfilt.c:122:fsfilt_get_ops()) Can&apos;t find fsfilt_osd-zfs interface
2012-10-03 17:26:39 LustreError: 6517:0:(filter.c:2324:filter_setup()) lstest-OST0182: filter_common_setup failed: -256.
2012-10-03 17:26:39 LustreError: 6517:0:(obd_config.c:572:class_setup()) setup lstest-OST0182 failed (-256)
2012-10-03 17:26:39 LustreError: 6517:0:(obd_config.c:1545:class_config_llog_handler()) MGC172.20.5.2@o2ib500: cfg command failed: rc = -256
2012-10-03 17:26:39 Lustre:    cmd=cf003 0:lstest-OST0182  1:dev  2:0  3:f  
2012-10-03 17:26:39 LustreError: 15c-8: MGC172.20.5.2@o2ib500: The configuration from log &apos;lstest-OST0182&apos; failed (-256). This may be the result of communication errors between this node and the MGS, a bad configuration, or other errors. See the syslog for more information.
2012-10-03 17:26:39 LustreError: 6396:0:(obd_mount.c:1212:server_start_targets()) failed to start server lstest-OST0182: -256
2012-10-03 17:26:39 Lustre: lstest-OST0182: Unable to start target: -256
2012-10-03 17:26:39 LustreError: 6396:0:(obd_config.c:619:class_cleanup()) Device 3 not setup
2012-10-03 17:26:39 Lustre: server umount lstest-OST0182 complete
2012-10-03 17:26:39 LustreError: 6396:0:(obd_mount.c:2332:lustre_fill_super()) Unable to mount  (-256)
2012-10-03 17:26:40 LustreError: 6568:0:(mgc_request.c:246:do_config_log_add()) failed processing sptlrpc log: -2
2012-10-03 17:26:44 Lustre: lstest-OST0182: Will be in recovery for at least 5:00, or until 256 clients reconnect.
2012-10-03 17:27:01 LustreError: 137-5: UUID &apos;ls1-OST0181_UUID&apos; is not available for connect (no target)
2012-10-03 17:27:01 LustreError: Skipped 406 previous similar messages
2012-10-03 17:27:55 Lustre: lstest-OST0182: Recovery over after 1:11, of 256 clients 256 recovered and 0 were evicted.
2012-10-03 17:27:55 Lustre: 6654:0:(ofd_obd.c:1058:ofd_orphans_destroy()) lstest-OST0182: deleting orphan objects from 137009 to 137287
2012-10-03 17:27:55 LustreError: 6654:0:(ldlm_resource.c:1101:ldlm_resource_get()) lvbo_init failed for resource 137254: rc -2
2012-10-03 17:27:55 LustreError: 6654:0:(ldlm_resource.c:1101:ldlm_resource_get()) lvbo_init failed for resource 137253: rc -2
2012-10-03 17:28:18 LustreError: 137-5: UUID &apos;ls1-OST0181_UUID&apos; is not available for connect (no target)
2012-10-03 17:28:18 LustreError: Skipped 4 previous similar messages
2012-10-03 17:30:48 LustreError: 137-5: UUID &apos;ls1-OST0181_UUID&apos; is not available for connect (no target)
2012-10-03 17:30:48 LustreError: Skipped 12 previous similar messages
2012-10-03 17:35:21 LustreError: 137-5: UUID &apos;ls1-OST0181_UUID&apos; is not available for connect (no target)
2012-10-03 17:35:21 LustreError: Skipped 21 previous similar messages
2012-10-03 17:36:25 BUG: Bad page state in process ll_ost_io02_002  pfn:ff5e41
2012-10-03 17:36:25 page:ffffea0037dc9e38 flags:00c0000000000080 count:0 mapcount:0 mapping:(null) index:0 (Tainted: P           ----------------  )
2012-10-03 17:36:25 Pid: 6629, comm: ll_ost_io02_002 Tainted: P           ----------------   2.6.32-220.23.1.1chaos.ch5.x86_64 #1
2012-10-03 17:36:25 Call Trace:
2012-10-03 17:36:25  [&amp;lt;ffffffff81121507&amp;gt;] ? bad_page+0x107/0x160
2012-10-03 17:36:25  [&amp;lt;ffffffff81124599&amp;gt;] ? free_hot_cold_page+0x1c9/0x220
2012-10-03 17:36:25  [&amp;lt;ffffffff811246af&amp;gt;] ? free_hot_page+0x2f/0x60
2012-10-03 17:36:25  [&amp;lt;ffffffff811275de&amp;gt;] ? __put_single_page+0x1e/0x30
2012-10-03 17:36:25  [&amp;lt;ffffffff81127755&amp;gt;] ? put_page+0x25/0x40
2012-10-03 17:36:25  [&amp;lt;ffffffffa086ff38&amp;gt;] ? ptlrpc_free_bulk+0x98/0x330 [ptlrpc]
2012-10-03 17:36:25  [&amp;lt;ffffffffa0d68e01&amp;gt;] ? ost_brw_write+0x501/0x15e0 [ost]
2012-10-03 17:36:25  [&amp;lt;ffffffffa087c24e&amp;gt;] ? ptlrpc_send_reply+0x28e/0x860 [ptlrpc]
2012-10-03 17:36:25  [&amp;lt;ffffffffa08443c0&amp;gt;] ? target_bulk_timeout+0x0/0xc0 [ptlrpc]
2012-10-03 17:36:25  [&amp;lt;ffffffffa0d6f4d2&amp;gt;] ? ost_handle+0x32e2/0x4690 [ost]
2012-10-03 17:36:25  [&amp;lt;ffffffffa088b39b&amp;gt;] ? ptlrpc_update_export_timer+0x4b/0x470 [ptlrpc]
2012-10-03 17:36:25  [&amp;lt;ffffffffa08937fc&amp;gt;] ? ptlrpc_server_handle_request+0x41c/0xe00 [ptlrpc]
2012-10-03 17:36:25  [&amp;lt;ffffffffa03306be&amp;gt;] ? cfs_timer_arm+0xe/0x10 [libcfs]
2012-10-03 17:36:25  [&amp;lt;ffffffffa034213f&amp;gt;] ? lc_watchdog_touch+0x6f/0x180 [libcfs]
2012-10-03 17:36:25  [&amp;lt;ffffffffa088abb7&amp;gt;] ? ptlrpc_wait_event+0xa7/0x2a0 [ptlrpc]
2012-10-03 17:36:25  [&amp;lt;ffffffff81051ba3&amp;gt;] ? __wake_up+0x53/0x70
2012-10-03 17:36:25  [&amp;lt;ffffffffa0894dd1&amp;gt;] ? ptlrpc_main+0xbf1/0x19e0 [ptlrpc]
2012-10-03 17:36:25  [&amp;lt;ffffffffa08941e0&amp;gt;] ? ptlrpc_main+0x0/0x19e0 [ptlrpc]
2012-10-03 17:36:25  [&amp;lt;ffffffff8100c14a&amp;gt;] ? child_rip+0xa/0x20
2012-10-03 17:36:25  [&amp;lt;ffffffffa08941e0&amp;gt;] ? ptlrpc_main+0x0/0x19e0 [ptlrpc]
2012-10-03 17:36:25  [&amp;lt;ffffffffa08941e0&amp;gt;] ? ptlrpc_main+0x0/0x19e0 [ptlrpc]
2012-10-03 17:36:25  [&amp;lt;ffffffff8100c140&amp;gt;] ? child_rip+0x0/0x20
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>
                            <comment id="45997" author="pjones" created="Thu, 4 Oct 2012 12:49:21 +0000"  >&lt;p&gt;Alex&lt;/p&gt;

&lt;p&gt;Please could someone look into this one?&lt;/p&gt;

&lt;p&gt;Thanks&lt;/p&gt;

&lt;p&gt;Peter&lt;/p&gt;</comment>
                            <comment id="45998" author="bzzz" created="Thu, 4 Oct 2012 12:59:51 +0000"  >&lt;p&gt;Peter, I&apos;m already.&lt;/p&gt;</comment>
                            <comment id="46041" author="bzzz" created="Fri, 5 Oct 2012 05:58:48 +0000"  >&lt;p&gt;I&apos;m not sure why did this start to happen just now: osd-zfs and ofd are the same as in orion.&lt;/p&gt;

&lt;p&gt;So, the root cause is that arc_buf_alloc() -&amp;gt; kmem_cache_alloc() allocates order N &amp;gt; 0 pages from the slab:&lt;/p&gt;

&lt;p&gt;data ffff880000440000, size 131072, off 0&lt;br/&gt;
  page:ffff8800062e8e00 flags:0000000000000080 count:1 mapcount:0 mapping:(null)&lt;br/&gt;
  page:ffff8800062e8e38 flags:0000000000000080 count:0 mapcount:0 mapping:(null)&lt;br/&gt;
  page:ffff8800062e8e70 flags:0000000000000080 count:0 mapcount:0 mapping:(null)&lt;br/&gt;
  page:ffff8800062e8ea8 flags:0000000000000080 count:0 mapcount:0 mapping:(null)&lt;br/&gt;
  page:ffff8800062e8ee0 flags:0000000000000080 count:0 mapcount:0 mapping:(null)&lt;br/&gt;
  page:ffff8800062e8f18 flags:0000000000000080 count:0 mapcount:0 mapping:(null)&lt;br/&gt;
  page:ffff8800062e8f50 flags:0000000000000080 count:0 mapcount:0 mapping:(null)&lt;/p&gt;

&lt;p&gt;The page allocator does not take a reference on the tail pages of a high-order allocation unless a compound page is requested with __GFP_COMP.&lt;/p&gt;

&lt;p&gt;ptlrpc_prep_bulk_page() grabs a reference on those pages, then ptlrpc_free_bulk() releases them,&lt;br/&gt;
so the pages&apos; reference counts go back to 0, which should not happen.&lt;/p&gt;

&lt;p&gt;I tend to think the right solution is not to touch refcounters from ptlrpc, at least on the server,&lt;br/&gt;
and let &lt;del&gt;-&amp;gt;dbo_bufs_get()/&lt;/del&gt;-&amp;gt;dbo_bufs_put() make sure pages are pinned properly.&lt;/p&gt;
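
&lt;p&gt;To illustrate the mechanism, a minimal sketch (assuming a 2.6.32-era kernel API; the cache and function names are hypothetical, and this is not code from the patch):&lt;/p&gt;

&lt;div class=&quot;preformatted panel&quot; style=&quot;border-width: 1px;&quot;&gt;&lt;div class=&quot;preformattedContent panelContent&quot;&gt;
&lt;pre&gt;
/* Hypothetical reproducer of the refcount bug described above;
 * error handling omitted for brevity. */
#include &amp;lt;linux/mm.h&amp;gt;
#include &amp;lt;linux/slab.h&amp;gt;

static void bad_page_state_demo(void)
{
        /* 128K objects, as in the arc_buf_alloc() dump above: the slab
         * backs each object with an order N &amp;gt; 0 page block allocated
         * without __GFP_COMP, so only the head page has _count = 1 and
         * every tail page sits at _count = 0. */
        struct kmem_cache *cache =
                kmem_cache_create(&quot;demo_128k&quot;, 131072, 0, 0, NULL);
        void *buf = kmem_cache_alloc(cache, GFP_KERNEL);
        struct page *tail = virt_to_page((char *)buf + PAGE_SIZE);

        get_page(tail);   /* as ptlrpc_prep_bulk_page(): _count 0 -&amp;gt; 1 */
        put_page(tail);   /* as ptlrpc_free_bulk(): _count 1 -&amp;gt; 0, so
                           * put_page() tries to free a PG_slab tail page
                           * and free_hot_cold_page() reports
                           * &quot;BUG: Bad page state&quot; */

        kmem_cache_free(cache, buf);
        kmem_cache_destroy(cache);
}
&lt;/pre&gt;
&lt;/div&gt;&lt;/div&gt;</comment>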
                            <comment id="46046" author="bzzz" created="Fri, 5 Oct 2012 08:39:08 +0000"  >&lt;p&gt;&lt;a href=&quot;http://review.whamcloud.com/4198&quot; class=&quot;external-link&quot; target=&quot;_blank&quot; rel=&quot;nofollow noopener&quot;&gt;http://review.whamcloud.com/4198&lt;/a&gt;&lt;/p&gt;

&lt;p&gt;Other ideas are welcome.&lt;/p&gt;</comment>
                            <comment id="46072" author="adilger" created="Fri, 5 Oct 2012 18:12:59 +0000"  >&lt;p&gt;To summarize a conversation in Skype, the root of this new problem is because LLNL is running a newer version of ZFS (rc11, AFAIK), while our testing was running an earlier version (rc10, AFAIK).  The newer ZFS is using different slab allocation internally, while the old version is only ever using vmalloc(), which is why we didn&apos;t see this problem in local testing.&lt;/p&gt;</comment>
                            <comment id="46115" author="ian" created="Mon, 8 Oct 2012 01:19:04 +0000"  >&lt;p&gt;4198 patch landed to master.&lt;/p&gt;</comment>
                    </comments>
        <issuelinks>
            <issuelinktype id="10011">
                <name>Related</name>
                <inwardlinks description="is related to">
                </inwardlinks>
            </issuelinktype>
        </issuelinks>
        <attachments>
        </attachments>
        <subtasks>
        </subtasks>
        <customfields>
            <customfield id="customfield_10890" key="com.atlassian.jira.plugins.jira-development-integration-plugin:devsummary">
                <customfieldname>Development</customfieldname>
                <customfieldvalues>
                </customfieldvalues>
            </customfield>
            <customfield id="customfield_10390" key="com.pyxis.greenhopper.jira:gh-lexo-rank">
                <customfieldname>Rank</customfieldname>
                <customfieldvalues>
                    <customfieldvalue>1|hzv54f:</customfieldvalue>
                </customfieldvalues>
            </customfield>
            <customfield id="customfield_10090" key="com.pyxis.greenhopper.jira:gh-global-rank">
                <customfieldname>Rank (Obsolete)</customfieldname>
                <customfieldvalues>
                    <customfieldvalue>4363</customfieldvalue>
                </customfieldvalues>
            </customfield>
            <customfield id="customfield_10060" key="com.atlassian.jira.plugin.system.customfieldtypes:select">
                <customfieldname>Severity</customfieldname>
                <customfieldvalues>
                    <customfieldvalue key="10022"><![CDATA[3]]></customfieldvalue>
                </customfieldvalues>
            </customfield>
        </customfields>
    </item>
</channel>
</rss>