Networker

Re: [Networker] all sessions slow at some time during backup

2012-11-25 04:32:20
Subject: Re: [Networker] all sessions slow at some time during backup
From: Yaron Zabary <yaron AT ARISTO.TAU.AC DOT IL>
To: NETWORKER AT LISTSERV.TEMPLE DOT EDU
Date: Sun, 25 Nov 2012 11:31:30 +0200
With such performace issues, it is useful to run the strace with the -tt flag so you also get timestamp for each call.

However, your comment below about the two different drives, one which runs fast and the other which is stalling, suggest that the fault is with the slow tape drive, media, HBA, SAN switch port.

  How are the tape drives connected to the server ?

Can you point at a common hardware part which always participate in a slow backup ?

  Can you try and disable such hardware and see if that helps ?

On 11/24/2012 09:20 PM, jeronimo wrote:
The server OS is 64-bit Linux running on HP Proliant hardware with lots of 
cores (yeah, overpowered)

I know nothing of a hardware fault. Anyway, what could it be? It more or less 
always behaves that way.

For the details you wanted:

On the server:

/ # ps -ef | grep nsr
root      2496 10671  0 04:02 ?        00:00:30 /opt/nsr/nsrindexd -a ADD hhh 
index for backup on hhh
root      3515 10625  0 18:14 ?        00:00:00 /opt/nsr/nsrmmd -n 6
root      4118 10671  0 18:15 ?        00:00:12 /opt/nsr/nsrindexd -a ADD iii 
index for backup on iii
root     10618     1  0 Nov18 ?        00:02:31 /opt/nsr/nsrexecd 2>&1
root     10625     1  0 Nov18 ?        00:07:29 /opt/nsr/nsrd 2>&1
root     10658     1  0 Nov18 ?        00:23:13 /opt/nsr/nmc/bin/gstd
root     10662     1  0 Nov18 ?        00:00:04 /opt/nsr/nmc/apache/bin/httpd 
-f /opt/nsr/nmc/apache/conf/httpd.conf
root     10667 10625  0 Nov18 ?        00:10:34 /opt/nsr/nsrmmdbd
root     10671 10625  0 Nov18 ?        00:00:22 /opt/nsr/nsrindexd
root     10674 10625  0 Nov18 ?        00:09:03 /opt/nsr/nsrjobd
root     10678 10658  0 Nov18 ?        00:02:30 /opt/nsr/nmc/sybasa/bin/dbsrv9 
-ct- -x tcpip(ServerPort=2638) -ti 1500 -tl 6000 -sb 0 -Q -gk all -o 
/opt/nsr/nmc/logs/db_output -n gst_on_serv /nsr/nmc/lgto_gstdb/lgto_gst.db
root     11476 10625  0 Nov18 ?        00:00:22 /opt/nsr/nsrmmgd
root     11515 10618  0 Nov18 ?        00:00:06 /opt/nsr/nsrlcpd -s serv -N 1 
-n 1
root     11517 10618  0 Nov18 ?        00:00:06 /opt/nsr/nsrlcpd -s serv -N 1 
-n 2
root     11556 10625  3 Nov18 ?        04:53:36 /opt/nsr/nsrmmd -n 1
root     11561 10625  7 Nov18 ?        11:08:43 /opt/nsr/nsrmmd -n 2
root     11566 10625  0 Nov18 ?        01:05:18 /opt/nsr/nsrmmd -n 3
root     11571 10625  0 Nov18 ?        00:00:00 /opt/nsr/nsrmmd -n 4
root     12039 10625  0 07:29 ?        00:00:00 /opt/nsr/savegrp WEEKEND-DEV-DB
root     12456 10671  0 07:38 ?        00:00:16 /opt/nsr/nsrindexd -a ADD xxx 
index for backup on xxx
root     13723 10671  0 07:55 ?        00:00:11 /opt/nsr/nsrindexd -a ADD zzz 
for backup on zzz
root     14068 10671  0 08:03 ?        00:00:00 /opt/nsr/nsrindexd -a ADD zzz 
index for backup on zzz
root     15057 10671  0 08:30 ?        00:00:10 /opt/nsr/nsrindexd -a ADD aaa 
index for backup on aaa
root     15541 10671  0 08:44 ?        00:00:08 /opt/nsr/nsrindexd -a ADD bbb 
index for backup on bbb
root     18646 10671  0 09:48 ?        00:00:00 /opt/nsr/nsrindexd -a ADD ccc 
index for backup on ccc
root     18649 10671  0 09:48 ?        00:00:00 /opt/nsr/nsrindexd -a ADD ddd 
index for backup on ddd
root     22244 10625  0 Nov23 ?        00:00:01 /opt/nsr/savegrp WEEKEND
nobody   27970 10662  0 Nov23 ?        00:00:00 /opt/nsr/nmc/apache/bin/httpd 
-f /opt/nsr/nmc/apache/conf/httpd.conf
nobody   27971 10662  0 Nov23 ?        00:00:00 /opt/nsr/nmc/apache/bin/httpd 
-f /opt/nsr/nmc/apache/conf/httpd.conf
root     31090 10671  0 02:37 ?        00:01:06 /opt/nsr/nsrindexd -a ADD eee 
index for backup on eee
root     31325 10671  0 Nov23 ?        00:00:00 /opt/nsr/nsrindexd -a READ 
nasndmp1 index for all on serv
root     31327 10625  0 Nov23 ?        00:00:00 /opt/nsr/ansrd nasndmp1:root 
recovering
root     32332 10625  0 Nov23 ?        00:00:00 /opt/nsr/nsrmmd -n 5


# strace -p 12456 (nsrindexd)
Process 12456 attached - interrupt to quit
select(64, [9], NULL, NULL, NULL
^C

# strace -p 22244 (savegrp WEEKEND)
Process 22244 attached - interrupt to quit
select(64, [5], NULL, NULL, {16, 836000}) = 0 (Timeout)
select(64, [5], NULL, NULL, {30, 0}
^C


On the client:

# ps -ef | grep nsr
root      5154     1  0 Nov11 ?        00:00:01 /opt/nsr/nsrexecd 2>&1
root     30589  5154  0 07:38 ?        00:02:48 /opt/nsr/savepnpc -s serv -g 
WEEKEND -LL -f - -m host -l full -q -W 78 -N /media /media

# strace -p 30589
Process 30589 attached - interrupt to quit
writev(5, [{"QH\23$\217x\324\205\242\256\326\321g`\317\373L\6\266\36"..., 
32208}], 1) = 32208
read(10, "e<9\240hzY\204\235\274\3173\203\32w\r\t\366\277;j\10#~"..., 65536) = 
65536
writev(5, [{"\200\0\0\0\0\1\0x.\re\270\0\0\0\0\0\0\0\2\0\5\363\330\0"..., 116}, 
{"\0\0\1\0\0\1\0\4\0\0\0\0e<9\240hzY\204\235\274\3173\203"..., 65548}], 2) = 65664
read(10, "H\r\341>H\16V5\330\261\215E~\310\7I\220\377b\317\362\226"..., 65536) 
= 65536
close(10)                               = 0
getuid()                                = 0
setreuid(65534, 4294967295)             = 0
utime("file.pdf", [2012/09/03-15:22:28, 2011/08/10-09:01:20]) = 0
setreuid(0, 4294967295)                 = 0
writev(5, [{"\200\0\0\0\200\0\0x+\re\270\0\0\0\0\0\0\0\2\0\5\363\330"..., 
128}], 1) = 128
lstat("file.zip", {st_mode=S_IFREG|0666, st_size=6853484, ...}) = 0
open("file.zip", O_RDONLY) = 10
fgetxattr(10, "system.posix_acl_access", 0x0, 0) = -1 ENODATA (No data 
available)
fgetxattr(10, "system.posix_acl_default", 0x0, 0) = -1 ENODATA (No data 
available)
flistxattr(10, (nil), 0)                = 17
flistxattr(10, 0x5d4210, 17)            = 17
fgetxattr(10, "netware.metadata", 0x0, 0) = 160
fgetxattr(10, "netware.metadata", 
"\xff\xfe\x01\x00\x00\x00\x00\x00\xef\x03\x00\x1a\x00\x00\x00\x00 
\x00\x00`\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xffS\x12\xd2O\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc6\x10\xd2O\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfa\x10\x9eP\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80\xf5\x8cr\xe4\x8a\xdd\x11\xbd\xd1\x00\x11\x85e0\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\x7f\xff\xff\xff\xff\x00\x00\x00",
 160) = 160
read(10, "PK\3\4\24\0\0\0\10\0z\201:@\357\366#\302P\222h\0=\232%"..., 65536) = 
65536
read(10, "PK\3\4\24\0\0\0\10\0z\201:@\357\366#\302P\222h\0=\232%"..., 65536) = 
65536
writev(5, [{"\200\0\0p*\re\270\0\0\0\0\0\0\0\2\0\5\363\330\0\0\0\315"..., 692}, 
{"\0\0\1\0\0\1\0\4\0\0\0\0PK\3\4\24\0\0\0\10\0z\201:@\357"..., 65548}], 2) = 66240
read(10, "\312\346\276L\27\326\376\24\214\34t=w{\35\345n\370z\264"..., 65536) = 
65536
writev(5, [{"\200\0\0\0\0\1\0x(\re\270\0\0\0\0\0\0\0\2\0\5\363\330\0"..., 116}, 
{"\0\0\1\0\0\1\0\4\0\0\0\0\312\346\276L\27\326\376\24\214"..., 65548}], 2) = 65664
read(10, "\357?\216SA\361\227=\307},\264m\feO(\247+\232Q\251\t\253"..., 65536) 
= 65536

But this is not progressing fast, about one 20 or 30 writev/read every 30 
seconds.....

Currently one of the hosts is backing up alone onto one tape and is really 
going fast.
While the others (about 10 savesets/sessions) are writing to another tape which 
is almost not progressing at all....

+----------------------------------------------------------------------
|This was sent by jm+backupcentral AT roth DOT lu via Backup Central.
|Forward SPAM to abuse AT backupcentral DOT com.
+----------------------------------------------------------------------



--

-- Yaron.