Print

Print


Hi,

      The XrdCnsd in xrootd-20090623 failed to create cns frequestly.
When starting XrdCnsd, it works fine. But when trying it next day, it
will not response. Below is the test result.

It seems the "poll" thread cannot be started for old XrdCnsd(started
more than one day and there is no active in long time(maybe 780s)).
For (*), Process 10816 was an old XrdCnsd(started more than one day)
process which didn't response. After killed it, a new process 13665
was created which worked fine.  From here we can see process 13665 had
a thread "poll"(thread 2) but process 10816 didn't have it.
For (**),  When there was no files copied to xrootd for 10 minutes,
the "poll" thread in Process 13665  disappeared. But when I did a
xrdcp to this server. We will saw this "poll" thread again. But for
process which has on active for long time, the "poll" thread cannot be
restart again.

(*)
[root@pcuwdata12 ~]# ps -ef|grep xrootd-2
xrootd   10798     1  0 Oct02 ?        00:21:21
/opt/xrootd-20090623/bin//xrootd -b -d -R xrootd -l
/tmp/log/xrootd/xrd.log -c /opt/xrootd-20090623/conf/xrdcluster.cfg
xrootd   10809     1  0 Oct02 ?        00:00:05
/opt/xrootd-20090623/bin//cmsd -b -d -R xrootd -l
/tmp/log/xrootd/cms.log -c /opt/xrootd-20090623/conf/xrdcluster.cfg
xrootd   10816 10798  0 Oct02 ?        00:00:00
/opt/xrootd-20090623/bin/XrdCnsd -d -l /tmp/log/xrootd/cnsd.log
root://pcuwsun04.cern.ch:1095
root     13452 13340  0 20:59 pts/1    00:00:00 grep xrootd-2
[root@pcuwdata12 ~]# pstack 10816
Thread 7 (Thread 0x410cf940 (LWP 10826)):
#0  0x000000370e40ae00 in pthread_cond_timedwait@@GLIBC_2.3.2 ()
#1  0x0000000000479707 in XrdSysCondVar::Wait ()
#2  0x000000000047b8e9 in XrdScheduler::TimeSched ()
#3  0x000000000047a21d in XrdStartTSched ()
#4  0x0000000000479637 in XrdSysThread_Xeq ()
#5  0x000000370e4064a7 in start_thread () from /lib64/libpthread.so.0
#6  0x000000370d8d3c2d in clone () from /lib64/libc.so.6
Thread 6 (Thread 0x416ed940 (LWP 10827)):
#0  0x000000370e40c9b1 in sem_wait () from /lib64/libpthread.so.0
#1  0x000000000041ffb7 in XrdSysSemaphore::Wait ()
#2  0x000000000047ae11 in XrdScheduler::Run ()
#3  0x000000000047a241 in XrdStartWorking ()
#4  0x0000000000479637 in XrdSysThread_Xeq ()
#5  0x000000370e4064a7 in start_thread () from /lib64/libpthread.so.0
#6  0x000000370d8d3c2d in clone () from /lib64/libc.so.6
Thread 5 (Thread 0x417ee940 (LWP 10828)):
#0  0x000000370e40c9b1 in sem_wait () from /lib64/libpthread.so.0
#1  0x000000000041ffb7 in XrdSysSemaphore::Wait ()
#2  0x000000000047ae11 in XrdScheduler::Run ()
#3  0x000000000047a241 in XrdStartWorking ()
#4  0x0000000000479637 in XrdSysThread_Xeq ()
#5  0x000000370e4064a7 in start_thread () from /lib64/libpthread.so.0
#6  0x000000370d8d3c2d in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x418ef940 (LWP 10834)):
#0  0x000000370e40d5cb in read () from /lib64/libpthread.so.0
#1  0x00000000004736b9 in XrdOucStream::GetLine ()
#2  0x000000000041eb06 in XrdCnsDaemon::getEvents ()
#3  0x000000000041dcfc in CnsEvents ()
#4  0x0000000000479637 in XrdSysThread_Xeq ()
#5  0x000000370e4064a7 in start_thread () from /lib64/libpthread.so.0
#6  0x000000370d8d3c2d in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x419f0940 (LWP 10835)):
#0  0x000000370e40d174 in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x000000370e408aca in _L_lock_1034 () from /lib64/libpthread.so.0
#2  0x000000370e40898c in pthread_mutex_lock () from /lib64/libpthread.so.0
#3  0x000000000041ff5b in XrdSysMutex::Lock ()
#4  0x000000000041f6d4 in XrdCnsEvent::Queue ()
#5  0x000000000041ee43 in XrdCnsDaemon::getEvents ()
#6  0x000000000041dcfc in CnsEvents ()
#7  0x0000000000479637 in XrdSysThread_Xeq ()
#8  0x000000370e4064a7 in start_thread () from /lib64/libpthread.so.0
#9  0x000000370d8d3c2d in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x41af1940 (LWP 11659)):
#0  0x000000370d89a0b1 in nanosleep () from /lib64/libc.so.6
#1  0x000000370d899ed4 in sleep () from /lib64/libc.so.6
#2  0x000000000044117a in GarbageCollectorThread ()
#3  0x000000000045413a in XrdClientThreadDispatcher ()
#4  0x0000000000479637 in XrdSysThread_Xeq ()
#5  0x000000370e4064a7 in start_thread () from /lib64/libpthread.so.0
#6  0x000000370d8d3c2d in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x2b53e8698db0 (LWP 10816)):
#0  0x000000370e40c9b1 in sem_wait () from /lib64/libpthread.so.0
#1  0x000000000041ffb7 in XrdSysSemaphore::Wait ()
#2  0x000000000041f8bd in XrdCnsEvent::Remove ()
#3  0x000000000041ea05 in XrdCnsDaemon::doRequests ()
#4  0x00000000004202d1 in main ()
[root@pcuwdata12 ~]# kill 10816
[root@pcuwdata12 ~]# kill 10816
-bash: kill: (10816) - No such process
[root@pcuwdata12 ~]# ps -ef|grep xrootd-2
xrootd   10798     1  0 Oct02 ?        00:21:21
/opt/xrootd-20090623/bin//xrootd -b -d -R xrootd -l
/tmp/log/xrootd/xrd.log -c /opt/xrootd-20090623/conf/xrdcluster.cfg
xrootd   10809     1  0 Oct02 ?        00:00:05
/opt/xrootd-20090623/bin//cmsd -b -d -R xrootd -l
/tmp/log/xrootd/cms.log -c /opt/xrootd-20090623/conf/xrdcluster.cfg
xrootd   13665 10798  0 21:03 ?        00:00:00
/opt/xrootd-20090623/bin/XrdCnsd -d -l /tmp/log/xrootd/cnsd.log
root://pcuwsun04.cern.ch:1095
root     13678 13340  0 21:03 pts/1    00:00:00 grep xrootd-2
[root@pcuwdata12 ~]# pstack 13665
Thread 8 (Thread 0x4074b940 (LWP 13666)):
#0  0x000000370e40ae00 in pthread_cond_timedwait@@GLIBC_2.3.2 ()
#1  0x0000000000479707 in XrdSysCondVar::Wait ()
#2  0x000000000047b8e9 in XrdScheduler::TimeSched ()
#3  0x000000000047a21d in XrdStartTSched ()
#4  0x0000000000479637 in XrdSysThread_Xeq ()
#5  0x000000370e4064a7 in start_thread () from /lib64/libpthread.so.0
#6  0x000000370d8d3c2d in clone () from /lib64/libc.so.6
Thread 7 (Thread 0x41b0f940 (LWP 13667)):
#0  0x000000370e40c9b1 in sem_wait () from /lib64/libpthread.so.0
#1  0x000000000041ffb7 in XrdSysSemaphore::Wait ()
#2  0x000000000047ae11 in XrdScheduler::Run ()
#3  0x000000000047a241 in XrdStartWorking ()
#4  0x0000000000479637 in XrdSysThread_Xeq ()
#5  0x000000370e4064a7 in start_thread () from /lib64/libpthread.so.0
#6  0x000000370d8d3c2d in clone () from /lib64/libc.so.6
Thread 6 (Thread 0x41d8c940 (LWP 13668)):
#0  0x000000370e40c9b1 in sem_wait () from /lib64/libpthread.so.0
#1  0x000000000041ffb7 in XrdSysSemaphore::Wait ()
#2  0x000000000047ae11 in XrdScheduler::Run ()
#3  0x000000000047a241 in XrdStartWorking ()
#4  0x0000000000479637 in XrdSysThread_Xeq ()
#5  0x000000370e4064a7 in start_thread () from /lib64/libpthread.so.0
#6  0x000000370d8d3c2d in clone () from /lib64/libc.so.6
Thread 5 (Thread 0x41e8d940 (LWP 13669)):
#0  0x000000370e40d5cb in read () from /lib64/libpthread.so.0
#1  0x00000000004736b9 in XrdOucStream::GetLine ()
#2  0x000000000041eb06 in XrdCnsDaemon::getEvents ()
#3  0x000000000041dcfc in CnsEvents ()
#4  0x0000000000479637 in XrdSysThread_Xeq ()
#5  0x000000370e4064a7 in start_thread () from /lib64/libpthread.so.0
#6  0x000000370d8d3c2d in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x41f8e940 (LWP 13670)):
#0  0x000000370e40d5cb in read () from /lib64/libpthread.so.0
#1  0x00000000004736b9 in XrdOucStream::GetLine ()
#2  0x000000000041eb06 in XrdCnsDaemon::getEvents ()
#3  0x000000000041dcfc in CnsEvents ()
#4  0x0000000000479637 in XrdSysThread_Xeq ()
#5  0x000000370e4064a7 in start_thread () from /lib64/libpthread.so.0
#6  0x000000370d8d3c2d in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x4208f940 (LWP 13671)):
#0  0x000000370d89a0b1 in nanosleep () from /lib64/libc.so.6
#1  0x000000370d899ed4 in sleep () from /lib64/libc.so.6
#2  0x000000000044117a in GarbageCollectorThread ()
#3  0x000000000045413a in XrdClientThreadDispatcher ()
#4  0x0000000000479637 in XrdSysThread_Xeq ()
#5  0x000000370e4064a7 in start_thread () from /lib64/libpthread.so.0
#6  0x000000370d8d3c2d in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x41279940 (LWP 13672)):
#0  0x000000370d8cae46 in poll () from /lib64/libc.so.6
#1  0x000000000046b945 in XrdClientSock::RecvRaw ()
#2  0x000000000044e08c in XrdClientPhyConnection::ReadRaw ()
#3  0x000000000044ac3e in XrdClientMessage::ReadRaw ()
#4  0x000000000044e84c in XrdClientPhyConnection::BuildMessage ()
#5  0x000000000044b730 in SocketReaderThread ()
#6  0x000000000045413a in XrdClientThreadDispatcher ()
#7  0x0000000000479637 in XrdSysThread_Xeq ()
#8  0x000000370e4064a7 in start_thread () from /lib64/libpthread.so.0
#9  0x000000370d8d3c2d in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x2b06bb478db0 (LWP 13665)):
#0  0x000000370e40c9b1 in sem_wait () from /lib64/libpthread.so.0
#1  0x000000000041ffb7 in XrdSysSemaphore::Wait ()
#2  0x000000000041f8bd in XrdCnsEvent::Remove ()
#3  0x000000000041ea05 in XrdCnsDaemon::doRequests ()
#4  0x00000000004202d1 in main ()
[root@pcuwdata12 ~]#
[root@pcuwdata12 ~]#


(***) cnsd.log
091006 20:56:10 1828 XrdCnsSched: running underused thread monitor inq=0
091006 20:56:10 1828 XrdCnsSched: 2 threads; 1 idle
091006 20:56:10 1828 XrdCnsSched: scheduling underused thread monitor
in 780 seconds


Thanks
Wen