Julien has attempted to run the "archive and retrieve 1 million files" test and has hit a problem. Both the EOS MGM and the CTA frontend crash at the same time when performing recalls. The EOS MGM is an SSI client and the CTA frontend is an SSI server. My current belief is that an application bug within the EOS MGM causes it to crash. The fact that the CTA frontend crashes in sympathy looks like a problem with SSI. Sorry to be so vague, but debugging all of this is very difficult.

Here is the stack trace from the crashed MGM (the SSI client), taken from `/var/eos/md/stacktrace`:
```
Thread 307 (Thread 0x7f64dcdfc700 (LWP 16855)):
#0  0x00007f6577de6e3d in nanosleep () from /lib64/libpthread.so.0
#1  0x00007f65713ef834 in sleep_for<long, std::ratio<1l, 1000l> > (
    __rtime=...) at /opt/rh/devtoolset-6/root/usr/include/c++/6.3.1/thread:323
#2  eos::common::ShellCmd::wait (this=this@entry=0x7f64dcddca10, 
    timeout=timeout@entry=120)
    at /usr/src/debug/eos-4.4.44-1/common/ShellCmd.cc:221
#3  0x00007f6571a92080 in eos::common::StackTrace::GdbTrace (
    executable=executable@entry=0x0, pid=<optimized out>, 
    what=what@entry=0x7f6571d416fd "thread apply all bt", 
    file=file@entry=0x7f6571d3dd25 "/var/eos/md/stacktrace", 
    ret_dump=ret_dump@entry=0x0)
    at /usr/src/debug/eos-4.4.44-1/common/StackTrace.hh:81
#4  0x00007f6571a5b160 in xrdmgmofs_stacktrace (sig=11)
    at /usr/src/debug/eos-4.4.44-1/mgm/XrdMgmOfs/Stacktrace.cc:59
#5  <signal handler called>
#6  _M_erase_after (
    this=0x7f6572052d10 <eos::mgm::WFE::Job::SendProtoWFRequest(eos::mgm::WFE::Job*, std::string const&, cta::xrd::Request const&, std::string&, bool)::service+112>, __pos=0x7f644a73a040)
    at /opt/rh/devtoolset-6/root/usr/include/c++/6.3.1/bits/forward_list.tcc:71
#7  erase_after (
    this=0x7f6572052d10 <eos::mgm::WFE::Job::SendProtoWFRequest(eos::mgm::WFE::Job*, std::string const&, cta::xrd::Request const&, std::string&, bool)::service+112>, __pos=...)
    at /opt/rh/devtoolset-6/root/usr/include/c++/6.3.1/bits/forward_list.h:958
#8  XrdSsiPb::ServiceClientSide<cta::xrd::Request, cta::xrd::Response, cta::xrd::Data, cta::xrd::Alert>::cleanup_dead_servers (
    this=this@entry=0x7f6572052ca0 <eos::mgm::WFE::Job::SendProtoWFRequest(eos::mgm::WFE::Job*, std::string const&, cta::xrd::Request const&, std::string&, bool)::service>, max_timeout_s=max_timeout_s@entry=1)
    at /usr/src/debug/eos-4.4.44-1/common/xrootd-ssi-protobuf-interface/include/XrdSsiPbServiceClientSide.hpp:328
#9  0x00007f6571c2cffd in XrdSsiPb::ServiceClientSide<cta::xrd::Request, cta::xrd::Response, cta::xrd::Data, cta::xrd::Alert>::SendAsync (
    this=this@entry=0x7f6572052ca0 <eos::mgm::WFE::Job::SendProtoWFRequest(eos::mgm::WFE::Job*, std::string const&, cta::xrd::Request const&, std::string&, bool)::service>, request=..., response=...)
    at /usr/src/debug/eos-4.4.44-1/common/xrootd-ssi-protobuf-interface/include/XrdSsiPbServiceClientSide.hpp:275
#10 0x00007f6571c1860a in Send (response=..., request=..., 
    this=0x7f6572052ca0 <eos::mgm::WFE::Job::SendProtoWFRequest(eos::mgm::WFE::Job*, std::string const&, cta::xrd::Request const&, std::string&, bool)::service>)
    at /usr/src/debug/eos-4.4.44-1/common/xrootd-ssi-protobuf-interface/include/XrdSsiPbServiceClientSide.hpp:103
#11 eos::mgm::WFE::Job::SendProtoWFRequest (jobPtr=0x7f64dcdde340, 
    jobPtr@entry=0x7f64dcddefc0, 
    fullPath="/eos/ctaeos/preprod/c6f4c0b8-33d5-4ee3-a3b7-661cc8c77894/1/test01079539", request=..., errorMsg=".meta4", retry=retry@entry=false)
    at /usr/src/debug/eos-4.4.44-1/mgm/WFE.cc:2188
#12 0x00007f6571c1c63f in eos::mgm::WFE::Job::HandleProtoMethodPrepareEvent (
    this=0x7f64dcddefc0, this@entry=0x7f64dcdf1070, 
    fullPath="\200\345\302qe\177\000\000`g\302qe\177\000\000\360j\302qe\177\000\000\000\000\000\000\000\000\000\000X)\003re\177\000\000\260\211\302qe\177\000\000 \213\302qe\177\000\000\000\000\000\000\000\000\000\000x)\003re\177\000\000\260c\302qe\177\000\000\240\333\302qe\177\000\000\340\360\302qe\177\000\000\000\346\302qe\177\000\000@c\302qe\177\000\000\320\322\302qe\177\000\000\000\337\302qe\177\000\000Pc\302qe\177\000\000`c\302qe\177\000\000\000\000\000\000\000\000\000\000\220)\003re\177\000\000@d\302qe\177\000\000`d\302qe\177\000\000`f\302qe\177\000\000\000\000\000\000\000\000\000\000\250)\003re\177\000\000"...<Address 0x7f657205a000 out of bounds>, ininfo=0x7f64dcdf16a0 "\350Card\177", 
    ininfo@entry=0x7f64dcdf6c40 "mgm.pcmd=event&mgm.event=sync::prepare&mgm.workflow=default&mgm.fid=0&mgm.path=/eos/ctaeos/preprod/c6f4c0b8-33d5-4ee3-a3b7-661cc8c77894/1/test01079539&mgm.logid=xxxxxxxx-xxxx-xxxx-xxxx-", 'x' <repeats 12 times>, "&mg"..., 
    errorMsg=<error reading variable: Cannot access memory at address 0xffffffffffffe7>) at /usr/src/debug/eos-4.4.44-1/mgm/WFE.cc:1758
#13 0x00007f6571c20357 in eos::mgm::WFE::Job::HandleProtoMethodEvents (
    this=this@entry=0x7f64dcdf1070, 
    errorMsg="trigger workflow - synchronous workflow failed", 
    ininfo=ininfo@entry=0x7f64dcdf6c40 "mgm.pcmd=event&mgm.event=sync::prepare&mgm.workflow=default&mgm.fid=0&mgm.path=/eos/ctaeos/preprod/c6f4c0b8-33d5-4ee3-a3b7-661cc8c77894/1/test01079539&mgm.logid=xxxxxxxx-xxxx-xxxx-xxxx-", 'x' <repeats 12 times>, "&mg"...) at /usr/src/debug/eos-4.4.44-1/mgm/WFE.cc:1608
#14 0x00007f6571c213d1 in eos::mgm::WFE::Job::DoIt (
    this=this@entry=0x7f64dcdf1070, issync=issync@entry=true, 
    errorMsg="trigger workflow - synchronous workflow failed", 
    ininfo=ininfo@entry=0x7f64dcdf6c40 "mgm.pcmd=event&mgm.event=sync::prepare&mgm.workflow=default&mgm.fid=0&mgm.path=/eos/ctaeos/preprod/c6f4c0b8-33d5-4ee3-a3b7-661cc8c77894/1/test01079539&mgm.logid=xxxxxxxx-xxxx-xxxx-xxxx-", 'x' <repeats 12 times>, "&mg"...) at /usr/src/debug/eos-4.4.44-1/mgm/WFE.cc:1543
#15 0x00007f6571c2fbad in eos::mgm::Workflow::ExceptionThrowingCreate (
    this=this@entry=0x7f64dcdf1640, vid=..., 
    ininfo=0x7f64dcdf6c40 "mgm.pcmd=event&mgm.event=sync::prepare&mgm.workflow=default&mgm.fid=0&mgm.path=/eos/ctaeos/preprod/c6f4c0b8-33d5-4ee3-a3b7-661cc8c77894/1/test01079539&mgm.logid=xxxxxxxx-xxxx-xxxx-xxxx-", 'x' <repeats 12 times>, "&mg"..., ininfo@entry=0x7f64dcdf16d0 "\341.", 
    errorMessage="trigger workflow - synchronous workflow failed")
    at /usr/src/debug/eos-4.4.44-1/mgm/Workflow.cc:231
#16 0x00007f6571c3101e in eos::mgm::Workflow::Create (
    this=this@entry=0x7f64dcdf1640, vid=..., 
    ininfo=ininfo@entry=0x7f64dcdf16d0 "\341.", 
    errorMessage="trigger workflow - synchronous workflow failed")
    at /usr/src/debug/eos-4.4.44-1/mgm/Workflow.cc:207
#17 0x00007f6571c317ee in eos::mgm::Workflow::Trigger (
    this=this@entry=0x7f64dcdf1640, event="sync::prepare", 
    workflow="default", vid=..., ininfo=0x7f64dcdf16d0 "\341.", 
    ininfo@entry=0x7f64dcdf6c40 "mgm.pcmd=event&mgm.event=sync::prepare&mgm.workflow=default&mgm.fid=0&mgm.path=/eos/ctaeos/preprod/c6f4c0b8-33d5-4ee3-a3b7-661cc8c77894/1/test01079539&mgm.logid=xxxxxxxx-xxxx-xxxx-xxxx-", 'x' <repeats 12 times>, "&mg"..., 
    errorMessage=<error reading variable: Cannot access memory at address 0x646d63702e6d6755>) at /usr/src/debug/eos-4.4.44-1/mgm/Workflow.cc:96
#18 0x00007f6571abbaf5 in XrdMgmOfs::Event (
    this=0x7f657204d360 <XrdSfsGetFileSystem::myFS>, path=<optimized out>, 
    ininfo=0x7f64dcdf6c40 "mgm.pcmd=event&mgm.event=sync::prepare&mgm.workflow=default&mgm.fid=0&mgm.path=/eos/ctaeos/preprod/c6f4c0b8-33d5-4ee3-a3b7-661cc8c77894/1/test01079539&mgm.logid=xxxxxxxx-xxxx-xxxx-xxxx-", 'x' <repeats 12 times>, "&mg"..., env=..., error=..., vid=..., client=0x7f64dcdfae70)
    at /usr/src/debug/eos-4.4.44-1/mgm/XrdMgmOfs/fsctl/Event.cc:185
#19 0x00007f6571a73401 in XrdMgmOfs::FSctl (
    this=0x7f6577bcf320 <std::string::_Rep::_S_empty_rep_storage>, 
    cmd=<optimized out>, args=..., error=..., client=0x7f64dcdfb490)
    at /usr/src/debug/eos-4.4.44-1/mgm/XrdMgmOfs/Fsctl.cc:281
#20 0x00007f6571a74a97 in XrdMgmOfs::prepare (
    this=0x7f657204d360 <XrdSfsGetFileSystem::myFS>, pargs=..., error=..., 
    client=<optimized out>)
    at /usr/src/debug/eos-4.4.44-1/mgm/XrdMgmOfs.cc:746
#21 0x00007f65784eb90b in XrdXrootdProtocol::do_Prepare (this=0x7f64dc83d600)
    at /usr/src/debug/xrootd/xrootd/src/XrdXrootd/XrdXrootdXeq.cc:1598
#22 0x00007f6578262829 in XrdLink::DoIt (this=0x7f64d772d008)
    at /usr/src/debug/xrootd/xrootd/src/Xrd/XrdLink.cc:441
#23 0x00007f6578265bdf in XrdScheduler::Run (
    this=0x610e78 <XrdMain::Config+440>)
    at /usr/src/debug/xrootd/xrootd/src/Xrd/XrdScheduler.cc:357
#24 0x00007f6578265d29 in XrdStartWorking (carg=<optimized out>)
    at /usr/src/debug/xrootd/xrootd/src/Xrd/XrdScheduler.cc:87
#25 0x00007f657822b927 in XrdSysThread_Xeq (myargs=0x7f644ece57e0)
    at /usr/src/debug/xrootd/xrootd/src/XrdSys/XrdSysPthread.cc:86
#26 0x00007f6577ddfdd5 in start_thread () from /lib64/libpthread.so.0
#27 0x00007f65770e1ead in clone () from /lib64/libc.so.6
```
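The interesting part of this trace is frames #6-#8: the MGM takes the SIGSEGV inside `std::forward_list::_M_erase_after()`, called from `XrdSsiPb::ServiceClientSide<...>::cleanup_dead_servers()`. A crash at that exact point usually means the list node being unlinked has already been freed or corrupted, which is what happens when a `forward_list` is erased through a stale iterator or is modified from another thread without a lock. I don't have the real `cleanup_dead_servers()` open in front of me, so the following is only a minimal sketch of the failure class I suspect, using made-up names (`DeadServer`, `dead_servers_mutex`) rather than the actual XrdSsiPbServiceClientSide.hpp code:

```
// Illustrative sketch only -- not the actual XrdSsiPbServiceClientSide.hpp code.
// It shows the erase-while-iterating idiom for a std::forward_list of
// dead-server entries, and why unsynchronized access can crash inside
// _M_erase_after (frames #6-#8 of the MGM trace).
#include <chrono>
#include <forward_list>
#include <mutex>

struct DeadServer {                                  // hypothetical entry type
  std::chrono::steady_clock::time_point died_at;
};

std::forward_list<DeadServer> dead_servers;          // shared between threads
std::mutex dead_servers_mutex;                       // must guard every access

void cleanup_dead_servers(std::chrono::seconds max_timeout) {
  // Without this lock, a concurrent insertion or erasure from another request
  // thread can free the node that 'prev' still points at; the next
  // erase_after(prev) then dereferences a dangling pointer and segfaults
  // inside _M_erase_after.
  std::lock_guard<std::mutex> lock(dead_servers_mutex);

  const auto now = std::chrono::steady_clock::now();
  auto prev = dead_servers.before_begin();
  for (auto it = dead_servers.begin(); it != dead_servers.end();) {
    if (now - it->died_at > max_timeout) {
      it = dead_servers.erase_after(prev);           // unlink *it, 'prev' stays valid
    } else {
      prev = it++;                                   // advance both iterators
    }
  }
}

int main() {
  dead_servers.push_front({std::chrono::steady_clock::now()});
  cleanup_dead_servers(std::chrono::seconds(1));     // nothing has expired yet
}
```

If the real implementation already serializes access to this list, then the corruption must come from some other path that frees the entries, but the crash site is at least consistent with this class of bug.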

Here is the stack trace from the crashed CTA frontend, which is the SSI server:
```
[root@ctafrontend ~]# gdb -c /var/log/tmp/ctafrontend-1557700613-xrootd-511-11.core
...
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `/usr/bin/xrootd -l /var/log/cta-frontend-xrootd.log -c /etc/cta/cta-frontend-xr'.
Program terminated with signal 11, Segmentation fault.
#0  0x00007f0e1847349b in raise () from /lib64/libpthread.so.0
Missing separate debuginfos, use: debuginfo-install bzip2-libs-1.0.6-13.el7.x86_64 cryptopp-5.6.2-10.el7.x86_64 cyrus-sasl-lib-2.1.26-23.el7.x86_64 elfutils-libelf-0.172-2.el7.x86_64 elfutils-libs-0.172-2.el7.x86_64 glibc-2.17-260.el7_6.4.x86_64 json-c-0.11-4.el7_0.x86_64 keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.15.1-37.el7_6.x86_64 libaio-0.3.109-13.el7.x86_64 libattr-2.4.46-13.el7.x86_64 libblkid-2.23.2-59.el7_6.1.x86_64 libcap-2.22-9.el7.x86_64 libcom_err-1.42.9-13.el7.x86_64 libgcc-4.8.5-36.el7_6.2.x86_64 libgcrypt-1.5.3-14.el7.x86_64 libgpg-error-1.12-3.el7.x86_64 libibverbs-17.2-3.el7.x86_64 libnl3-3.2.28-4.el7.x86_64 librados2-12.2.2-0.el7.x86_64 libradosstriper1-12.2.2-0.el7.x86_64 libselinux-2.5-14.1.el7.x86_64 libstdc++-4.8.5-36.el7_6.2.x86_64 libuuid-2.23.2-59.el7_6.1.x86_64 libxml2-2.9.1-6.el7_2.3.x86_64 lttng-ust-2.4.1-4.el7.x86_64 lz4-1.7.5-2.el7.x86_64 mariadb-libs-5.5.60-1.el7_5.x86_64 nspr-4.19.0-1.el7_5.x86_64 nss-3.36.0-7.1.el7_6.cern.x86_64 nss-softokn-3.36.0-5.el7_5.x86_64 nss-softokn-freebl-3.36.0-5.el7_5.x86_64 nss-util-3.36.0-1.1.el7_6.x86_64 openldap-2.4.44-21.el7_6.x86_64 openssl-libs-1.0.2k-16.el7_6.1.x86_64 oracle-instantclient12.2-basic-12.2.0.1.0-1.x86_64 pcre-8.32-17.el7.x86_64 postgresql-libs-9.2.24-1.el7_5.x86_64 protobuf3-3.3.1-1.el7.cern.x86_64 sqlite-3.7.17-8.el7.x86_64 systemd-libs-219-62.el7_6.6.x86_64 userspace-rcu-0.7.16-1.el7.x86_64 xz-libs-5.2.2-1.el7.x86_64 zlib-1.2.7-18.el7.x86_64
(gdb) bt
#0  0x00007f0e1847349b in raise () from /lib64/libpthread.so.0
#1  0x00007f0e0a95f582 in skgesigOSCrash () from /usr/lib/oracle/12.2/client64/lib/libclntsh.so.12.1
#2  0x00007f0e0af7f8a5 in kpeDbgSignalHandler () from /usr/lib/oracle/12.2/client64/lib/libclntsh.so.12.1
#3  0x00007f0e0a95f8c0 in skgesig_sigactionHandler () from /usr/lib/oracle/12.2/client64/lib/libclntsh.so.12.1
#4  <signal handler called>
#5  0x00007f0e0ae020c2 in dbgePostErrorDirect () from /usr/lib/oracle/12.2/client64/lib/libclntsh.so.12.1
#6  0x00007f0e0af7f94f in kpeDbgSignalHandler () from /usr/lib/oracle/12.2/client64/lib/libclntsh.so.12.1
#7  0x00007f0e0a95f8c0 in skgesig_sigactionHandler () from /usr/lib/oracle/12.2/client64/lib/libclntsh.so.12.1
#8  <signal handler called>
#9  0x00007f0e176a6207 in raise () from /lib64/libc.so.6
#10 0x00007f0e176a78f8 in abort () from /lib64/libc.so.6
#11 0x00007f0e17fb5765 in __gnu_cxx::__verbose_terminate_handler() () from /lib64/libstdc++.so.6
#12 0x00007f0e17fb3746 in ?? () from /lib64/libstdc++.so.6
#13 0x00007f0e17fb3773 in std::terminate() () from /lib64/libstdc++.so.6
#14 0x00007f0e17fb42df in __cxa_pure_virtual () from /lib64/libstdc++.so.6
#15 0x00007f0e133f7ad8 in XrdSsiFileReq::WakeUp (this=this@entry=0x7f0d44045e00, aP=aP@entry=0x0) at /usr/src/debug/xrootd/xrootd/src/XrdSsi/XrdSsiFileReq.cc:1047
#16 0x00007f0e133f89be in XrdSsiFileReq::ProcessResponse (this=0x7f0d44045e00, eInfo=..., Resp=...) at /usr/src/debug/xrootd/xrootd/src/XrdSsi/XrdSsiFileReq.cc:629
#17 0x00007f0e131dcf49 in XrdSsiResponder::SetResponse (this=0x7f0d7c7f7c10, buff=0x0, blen=0) at /usr/src/debug/xrootd/xrootd/src/XrdSsi/XrdSsiResponder.cc:267
#18 0x00007f0e104c6c13 in XrdSsiResponder::SetNilResponse (this=0x7f0d7c7f7c10) at /usr/include/xrootd/private/XrdSsi/XrdSsiResponder.hh:202
#19 0x00007f0e104da5b4 in XrdSsiPb::RequestProc<cta::xrd::Request, cta::xrd::Response, cta::xrd::Alert>::Execute (this=0x7f0d7c7f7c10) at /usr/src/debug/cta-0-855557gitcc7882cd/xrootd-ssi-protobuf-interface/include/XrdSsiPbRequestProc.hpp:191
#20 0x00007f0e104d90ed in XrdSsiPb::Service<cta::xrd::Request, cta::xrd::Response, cta::xrd::Alert>::ProcessRequest (this=0x1534610, reqRef=..., resRef=...) at /usr/src/debug/cta-0-855557gitcc7882cd/xrootd-ssi-protobuf-interface/include/XrdSsiPbService.hpp:155
#21 0x00007f0e188f1bdf in XrdScheduler::Run (this=0x610e78 <XrdMain::Config+440>) at /usr/src/debug/xrootd/xrootd/src/Xrd/XrdScheduler.cc:357
#22 0x00007f0e188f1d29 in XrdStartWorking (carg=<optimized out>) at /usr/src/debug/xrootd/xrootd/src/Xrd/XrdScheduler.cc:87
#23 0x00007f0e188b7927 in XrdSysThread_Xeq (myargs=0x7f0a8406dcc0) at /usr/src/debug/xrootd/xrootd/src/XrdSys/XrdSysPthread.cc:86
#24 0x00007f0e1846bdd5 in start_thread () from /lib64/libpthread.so.0
#25 0x00007f0e1776dead in clone () from /lib64/libc.so.6
(gdb) 
```
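On the frontend side the real story is in frames #9-#15: a virtual call made from `XrdSsiFileReq::WakeUp()` lands in `__cxa_pure_virtual()`, which calls `std::terminate()` and then `abort()`; the Oracle client library's signal handlers (frames #1-#8) then trip over the resulting signal, which is why `libclntsh` sits at the top of the stack. A pure-virtual call at runtime normally means a virtual method was dispatched on an object whose derived part had already been destroyed (or was not yet constructed), i.e. the wake-up raced with the teardown of the request object, plausibly triggered by the MGM client dying mid-request. The sketch below is only an illustration of that mechanism, with hypothetical stand-in classes rather than the real `XrdSsiFileReq`/`XrdSsiResponder` hierarchy; it reproduces the same `pure virtual method called` / `std::terminate()` / `abort()` sequence:

```
// Illustration of the __cxa_pure_virtual failure mode, not XrdSsi code.
// RequestBase and FileRequest are hypothetical stand-ins.
#include <iostream>

struct RequestBase {
  virtual void WakeUp() = 0;            // pure virtual, like the SSI callback
  virtual ~RequestBase() { finish(); }  // virtual dispatch during destruction
  void finish() { WakeUp(); }           // by now only the base sub-object
                                        // exists, so the vtable slot points
                                        // at __cxa_pure_virtual
};

struct FileRequest : RequestBase {
  void WakeUp() override { std::cout << "response delivered\n"; }
};

int main() {
  FileRequest req;
  // Destroying 'req' runs ~FileRequest() first, then ~RequestBase(), whose
  // finish() call aborts with "pure virtual method called" -- the same
  // __cxa_pure_virtual -> std::terminate() -> abort() chain as frames #9-#14.
}
```

In the real crash the equivalent would be the server posting its (nil) response to an `XrdSsiFileReq` that is concurrently being finalized or recycled, which would fit with the frontend only dying when the MGM dies.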
