Networker

Re: [Networker] all sessions slow at some time during backup

2012-11-25 07:30:37
Subject: Re: [Networker] all sessions slow at some time during backup
From: Yaron Zabary <yaron AT ARISTO.TAU.AC DOT IL>
To: NETWORKER AT LISTSERV.TEMPLE DOT EDU
Date: Sun, 25 Nov 2012 14:30:18 +0200
And another option. Is it possible that the disks you backup are all from the same storage ? Maybe there is some other process which runs on that that storage (SANcopy, SRDF).

On 11/24/2012 09:20 PM, jeronimo wrote:
The server OS is 64-bit Linux running on HP Proliant hardware with lots of 
cores (yeah, overpowered)

I know nothing of a hardware fault. Anyway, what could it be? It more or less 
always behaves that way.

For the details you wanted:

On the server:

/ # ps -ef | grep nsr
root      2496 10671  0 04:02 ?        00:00:30 /opt/nsr/nsrindexd -a ADD hhh 
index for backup on hhh
root      3515 10625  0 18:14 ?        00:00:00 /opt/nsr/nsrmmd -n 6
root      4118 10671  0 18:15 ?        00:00:12 /opt/nsr/nsrindexd -a ADD iii 
index for backup on iii
root     10618     1  0 Nov18 ?        00:02:31 /opt/nsr/nsrexecd 2>&1
root     10625     1  0 Nov18 ?        00:07:29 /opt/nsr/nsrd 2>&1
root     10658     1  0 Nov18 ?        00:23:13 /opt/nsr/nmc/bin/gstd
root     10662     1  0 Nov18 ?        00:00:04 /opt/nsr/nmc/apache/bin/httpd 
-f /opt/nsr/nmc/apache/conf/httpd.conf
root     10667 10625  0 Nov18 ?        00:10:34 /opt/nsr/nsrmmdbd
root     10671 10625  0 Nov18 ?        00:00:22 /opt/nsr/nsrindexd
root     10674 10625  0 Nov18 ?        00:09:03 /opt/nsr/nsrjobd
root     10678 10658  0 Nov18 ?        00:02:30 /opt/nsr/nmc/sybasa/bin/dbsrv9 
-ct- -x tcpip(ServerPort=2638) -ti 1500 -tl 6000 -sb 0 -Q -gk all -o 
/opt/nsr/nmc/logs/db_output -n gst_on_serv /nsr/nmc/lgto_gstdb/lgto_gst.db
root     11476 10625  0 Nov18 ?        00:00:22 /opt/nsr/nsrmmgd
root     11515 10618  0 Nov18 ?        00:00:06 /opt/nsr/nsrlcpd -s serv -N 1 
-n 1
root     11517 10618  0 Nov18 ?        00:00:06 /opt/nsr/nsrlcpd -s serv -N 1 
-n 2
root     11556 10625  3 Nov18 ?        04:53:36 /opt/nsr/nsrmmd -n 1
root     11561 10625  7 Nov18 ?        11:08:43 /opt/nsr/nsrmmd -n 2
root     11566 10625  0 Nov18 ?        01:05:18 /opt/nsr/nsrmmd -n 3
root     11571 10625  0 Nov18 ?        00:00:00 /opt/nsr/nsrmmd -n 4
root     12039 10625  0 07:29 ?        00:00:00 /opt/nsr/savegrp WEEKEND-DEV-DB
root     12456 10671  0 07:38 ?        00:00:16 /opt/nsr/nsrindexd -a ADD xxx 
index for backup on xxx
root     13723 10671  0 07:55 ?        00:00:11 /opt/nsr/nsrindexd -a ADD zzz 
for backup on zzz
root     14068 10671  0 08:03 ?        00:00:00 /opt/nsr/nsrindexd -a ADD zzz 
index for backup on zzz
root     15057 10671  0 08:30 ?        00:00:10 /opt/nsr/nsrindexd -a ADD aaa 
index for backup on aaa
root     15541 10671  0 08:44 ?        00:00:08 /opt/nsr/nsrindexd -a ADD bbb 
index for backup on bbb
root     18646 10671  0 09:48 ?        00:00:00 /opt/nsr/nsrindexd -a ADD ccc 
index for backup on ccc
root     18649 10671  0 09:48 ?        00:00:00 /opt/nsr/nsrindexd -a ADD ddd 
index for backup on ddd
root     22244 10625  0 Nov23 ?        00:00:01 /opt/nsr/savegrp WEEKEND
nobody   27970 10662  0 Nov23 ?        00:00:00 /opt/nsr/nmc/apache/bin/httpd 
-f /opt/nsr/nmc/apache/conf/httpd.conf
nobody   27971 10662  0 Nov23 ?        00:00:00 /opt/nsr/nmc/apache/bin/httpd 
-f /opt/nsr/nmc/apache/conf/httpd.conf
root     31090 10671  0 02:37 ?        00:01:06 /opt/nsr/nsrindexd -a ADD eee 
index for backup on eee
root     31325 10671  0 Nov23 ?        00:00:00 /opt/nsr/nsrindexd -a READ 
nasndmp1 index for all on serv
root     31327 10625  0 Nov23 ?        00:00:00 /opt/nsr/ansrd nasndmp1:root 
recovering
root     32332 10625  0 Nov23 ?        00:00:00 /opt/nsr/nsrmmd -n 5


# strace -p 12456 (nsrindexd)
Process 12456 attached - interrupt to quit
select(64, [9], NULL, NULL, NULL
^C

# strace -p 22244 (savegrp WEEKEND)
Process 22244 attached - interrupt to quit
select(64, [5], NULL, NULL, {16, 836000}) = 0 (Timeout)
select(64, [5], NULL, NULL, {30, 0}
^C


On the client:

# ps -ef | grep nsr
root      5154     1  0 Nov11 ?        00:00:01 /opt/nsr/nsrexecd 2>&1
root     30589  5154  0 07:38 ?        00:02:48 /opt/nsr/savepnpc -s serv -g 
WEEKEND -LL -f - -m host -l full -q -W 78 -N /media /media

# strace -p 30589
Process 30589 attached - interrupt to quit
writev(5, [{"QH\23$\217x\324\205\242\256\326\321g`\317\373L\6\266\36"..., 
32208}], 1) = 32208
read(10, "e<9\240hzY\204\235\274\3173\203\32w\r\t\366\277;j\10#~"..., 65536) = 
65536
writev(5, [{"\200\0\0\0\0\1\0x.\re\270\0\0\0\0\0\0\0\2\0\5\363\330\0"..., 116}, 
{"\0\0\1\0\0\1\0\4\0\0\0\0e<9\240hzY\204\235\274\3173\203"..., 65548}], 2) = 65664
read(10, "H\r\341>H\16V5\330\261\215E~\310\7I\220\377b\317\362\226"..., 65536) 
= 65536
close(10)                               = 0
getuid()                                = 0
setreuid(65534, 4294967295)             = 0
utime("file.pdf", [2012/09/03-15:22:28, 2011/08/10-09:01:20]) = 0
setreuid(0, 4294967295)                 = 0
writev(5, [{"\200\0\0\0\200\0\0x+\re\270\0\0\0\0\0\0\0\2\0\5\363\330"..., 
128}], 1) = 128
lstat("file.zip", {st_mode=S_IFREG|0666, st_size=6853484, ...}) = 0
open("file.zip", O_RDONLY) = 10
fgetxattr(10, "system.posix_acl_access", 0x0, 0) = -1 ENODATA (No data 
available)
fgetxattr(10, "system.posix_acl_default", 0x0, 0) = -1 ENODATA (No data 
available)
flistxattr(10, (nil), 0)                = 17
flistxattr(10, 0x5d4210, 17)            = 17
fgetxattr(10, "netware.metadata", 0x0, 0) = 160
fgetxattr(10, "netware.metadata", 
"\xff\xfe\x01\x00\x00\x00\x00\x00\xef\x03\x00\x1a\x00\x00\x00\x00 
\x00\x00`\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xffS\x12\xd2O\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xc6\x10\xd2O\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xfa\x10\x9eP\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80\xf5\x8cr\xe4\x8a\xdd\x11\xbd\xd1\x00\x11\x85e0\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\x7f\xff\xff\xff\xff\x00\x00\x00",
 160) = 160
read(10, "PK\3\4\24\0\0\0\10\0z\201:@\357\366#\302P\222h\0=\232%"..., 65536) = 
65536
read(10, "PK\3\4\24\0\0\0\10\0z\201:@\357\366#\302P\222h\0=\232%"..., 65536) = 
65536
writev(5, [{"\200\0\0p*\re\270\0\0\0\0\0\0\0\2\0\5\363\330\0\0\0\315"..., 692}, 
{"\0\0\1\0\0\1\0\4\0\0\0\0PK\3\4\24\0\0\0\10\0z\201:@\357"..., 65548}], 2) = 66240
read(10, "\312\346\276L\27\326\376\24\214\34t=w{\35\345n\370z\264"..., 65536) = 
65536
writev(5, [{"\200\0\0\0\0\1\0x(\re\270\0\0\0\0\0\0\0\2\0\5\363\330\0"..., 116}, 
{"\0\0\1\0\0\1\0\4\0\0\0\0\312\346\276L\27\326\376\24\214"..., 65548}], 2) = 65664
read(10, "\357?\216SA\361\227=\307},\264m\feO(\247+\232Q\251\t\253"..., 65536) 
= 65536

But this is not progressing fast, about one 20 or 30 writev/read every 30 
seconds.....

Currently one of the hosts is backing up alone onto one tape and is really 
going fast.
While the others (about 10 savesets/sessions) are writing to another tape which 
is almost not progressing at all....

+----------------------------------------------------------------------
|This was sent by jm+backupcentral AT roth DOT lu via Backup Central.
|Forward SPAM to abuse AT backupcentral DOT com.
+----------------------------------------------------------------------



--

-- Yaron.