TLS variable relocation difference |
|||||||
Issue description
Version: R55-8830.0.0
OS: ChromeOS, downloaded from goldeneye
Arch: arm
Both GetThreadHeap() in thread_cache.h and CreateCacheIfNecessary() in thread_cache.cc
(in third_party/tcmalloc/chromium/src/) use the TLS variable threadlocal_heap_. But they seem to point to different variables.
In GetCache (which inlines GetThreadHeap), the variable is accessed starting from <+24> to <+34> below (the mrc instruction obtains the thread pointer):
Dump of assembler code for function _ZN8tcmalloc11ThreadCache8GetCacheEv:
0xab9f98c4 <+0>: strd r3, lr, [sp, #-8]!
0xab9f98c8 <+4>: ldr r3, [pc, #68] ; (0xab9f9910 <_ZN8tcmalloc11ThreadCache8GetCacheEv+76>)
=> 0xab9f98ca <+6>: add r3, pc
0xab9f98cc <+8>: ldr r3, [r3, #0]
0xab9f98ce <+10>: ldrb r0, [r3, #0]
0xab9f98d0 <+12>: cbz r0, 0xab9f98ee <_ZN8tcmalloc11ThreadCache8GetCacheEv+42>
0xab9f98d2 <+14>: ldr r1, [pc, #64] ; (0xab9f9914 <_ZN8tcmalloc11ThreadCache8GetCacheEv+80>)
0xab9f98d4 <+16>: add r1, pc
0xab9f98d6 <+18>: ldr r1, [r1, #0]
0xab9f98d8 <+20>: ldrb r2, [r1, #0]
0xab9f98da <+22>: cbz r2, 0xab9f98fc <_ZN8tcmalloc11ThreadCache8GetCacheEv+56>
0xab9f98dc <+24>: ldr r0, [pc, #44] ; (0xab9f990c <_ZN8tcmalloc11ThreadCache8GetCacheEv+72>)
0xab9f98de <+26>: mrc 15, 0, r12, cr13, cr0, {3}
0xab9f98e2 <+30>: add r0, pc
0xab9f98e4 <+32>: ldr r0, [r0, #0]
0xab9f98e6 <+34>: ldr.w r0, [r12, r0]
0xab9f98ea <+38>: cbz r0, 0xab9f98f2 <_ZN8tcmalloc11ThreadCache8GetCacheEv+46>
0xab9f98ec <+40>: pop {r3, pc}
0xab9f98ee <+42>: bl 0xab9f7eb8 <_ZN8tcmalloc11ThreadCache10InitModuleEv>
0xab9f98f2 <+46>: ldrd r3, lr, [sp]
0xab9f98f6 <+50>: add sp, #8
0xab9f98f8 <+52>: b.w 0xab9f7f9c <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv>
0xab9f98fc <+56>: ldr r3, [pc, #24] ; (0xab9f9918 <_ZN8tcmalloc11ThreadCache8GetCacheEv+84>)
0xab9f98fe <+58>: add r3, pc
0xab9f9900 <+60>: ldr r3, [r3, #0]
0xab9f9902 <+62>: ldr r0, [r3, #0]
0xab9f9904 <+64>: bl 0xab9f4b38 <_Z29perftools_pthread_getspecificj>
0xab9f9908 <+68>: b.n 0xab9f98ea <_ZN8tcmalloc11ThreadCache8GetCacheEv+38>
0xab9f990a <+70>: nop
0xab9f990c <+72>: str r6, [r5, #60] ; 0x3c
0xab9f990e <+74>: lsls r7, r4, #17
0xab9f9910 <+76>: str r2, [r1, #64] ; 0x40
0xab9f9912 <+78>: lsls r7, r4, #17
0xab9f9914 <+80>: str r4, [r0, #64] ; 0x40
0xab9f9916 <+82>: lsls r7, r4, #17
0xab9f9918 <+84>: str r6, [r3, #60] ; 0x3c
0xab9f991a <+86>: lsls r7, r4, #17
End of assembler dump.
<+24> loads:
(gdb) x/wx 0xab9f990c
0xab9f990c <_ZN8tcmalloc11ThreadCache8GetCacheEv+72>: 0x046763ee
<+30> and <+32> loads:
(gdb) x/wx (0x046763ee+0xab9f98e2+4)
0xb006fcd4: 0x00000018
However in the other function CreateCacheInfNecessary, the variable is accessed starting from <+110> to <+124> below:
Dump of assembler code for function _ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv:
0xab9f7f9c <+0>: strd r4, r5, [sp, #-16]!
0xab9f7fa0 <+4>: ldr r5, [pc, #136] ; (0xab9f802c <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv+144>)
0xab9f7fa2 <+6>: strd r7, lr, [sp, #8]
=> 0xab9f7fa6 <+10>: add r5, pc
0xab9f7fa8 <+12>: ldr r5, [r5, #0]
0xab9f7faa <+14>: add r7, sp, #0
0xab9f7fac <+16>: mov r0, r5
0xab9f7fae <+18>: warning: Could not find DWO CU obj/base/allocator/tcmalloc/low_level_alloc.dwo(0xd41c5365e59d55dc) referenced by CU at offset 0x414 [in module /usr/local/lib/debug/opt/google/chrome/chrome.debug]
bl 0xab9f14b0 <_ZN8SpinLock4LockEv>
0xab9f7fb2 <+22>: bl 0xaba451ec
0xab9f7fb6 <+26>: ldr r3, [pc, #120] ; (0xab9f8030 <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv+148>)
0xab9f7fb8 <+28>: add r3, pc
0xab9f7fba <+30>: ldr r4, [r3, #0]
0xab9f7fbc <+32>: cbnz r4, 0xab9f7fc4 <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv+40>
0xab9f7fbe <+34>: b.n 0xab9f7fec <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv+80>
0xab9f7fc0 <+36>: ldr r4, [r4, #0]
0xab9f7fc2 <+38>: cbz r4, 0xab9f7fec <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv+80>
0xab9f7fc4 <+40>: ldr.w r1, [r4, #688] ; 0x2b0
0xab9f7fc8 <+44>: cmp r1, r0
0xab9f7fca <+46>: bne.n 0xab9f7fc0 <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv+36>
0xab9f7fcc <+48>: mov r0, r5
0xab9f7fce <+50>: bl 0xab9f1530 <_ZN8SpinLock6UnlockEv>
0xab9f7fd2 <+54>: ldrb.w r5, [r4, #692] ; 0x2b4
0xab9f7fd6 <+58>: cbnz r5, 0xab9f7fe0 <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv+68>
0xab9f7fd8 <+60>: ldr r0, [pc, #88] ; (0xab9f8034 <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv+152>)
0xab9f7fda <+62>: add r0, pc
0xab9f7fdc <+64>: ldrb r2, [r0, #0]
0xab9f7fde <+66>: cbnz r2, 0xab9f7ff4 <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv+88>
0xab9f7fe0 <+68>: mov r0, r4
0xab9f7fe2 <+70>: mov sp, r7
0xab9f7fe4 <+72>: ldrd r4, r5, [sp]
0xab9f7fe8 <+76>: add sp, #8
0xab9f7fea <+78>: pop {r7, pc}
0xab9f7fec <+80>: bl 0xab9f7f2c <_ZN8tcmalloc11ThreadCache7NewHeapEm>
0xab9f7ff0 <+84>: mov r4, r0
0xab9f7ff2 <+86>: b.n 0xab9f7fcc <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv+48>
0xab9f7ff4 <+88>: ldr.w r12, [pc, #64] ; 0xab9f8038 <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv+156>
0xab9f7ff8 <+92>: movs r3, #1
0xab9f7ffa <+94>: mov r1, r4
0xab9f7ffc <+96>: strb.w r3, [r4, #692] ; 0x2b4
0xab9f8000 <+100>: add r12, pc
0xab9f8002 <+102>: ldr.w r0, [r12]
0xab9f8006 <+106>: bl 0xab9f4b6c <_Z29perftools_pthread_setspecificjPv>
0xab9f800a <+110>: ldr r1, [pc, #28] ; (0xab9f8028 <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv+140>)
0xab9f800c <+112>: mrc 15, 0, r0, cr13, cr0, {3}
0xab9f8010 <+116>: add r1, pc
0xab9f8012 <+118>: ldr r1, [r1, #0]
0xab9f8014 <+120>: strb.w r5, [r4, #692] ; 0x2b4
0xab9f8018 <+124>: str r4, [r0, r1]
0xab9f801a <+126>: mov r0, r4
0xab9f801c <+128>: mov sp, r7
0xab9f801e <+130>: ldrd r4, r5, [sp]
0xab9f8022 <+134>: add sp, #8
0xab9f8024 <+136>: pop {r7, pc}
0xab9f8026 <+138>: nop
0xab9f8028 <+140>: ldrb r0, [r5, #18]
0xab9f802a <+142>: lsls r7, r4, #17
0xab9f802c <+144>: ldrb r6, [r2, #19]
0xab9f802e <+146>: lsls r7, r4, #17
0xab9f8030 <+148>: ldr r3, [pc, #1008] ; (0xab9f8424 <_ZN12_GLOBAL__N_123InvalidGetAllocatedSizeEPKv+32>)
0xab9f8032 <+150>: lsls r4, r5, #17
0xab9f8034 <+152>: ldr r3, [pc, #856] ; (0xab9f8390 <_ZN22TCMallocImplementation25ReadHeapGrowthStackTracesEv+240>)
0xab9f8036 <+154>: lsls r4, r5, #17
0xab9f8038 <+156>: ldr r3, [pc, #608] ; (0xab9f829c <_ZN22TCMallocImplementation18GetSystemAllocatorEv+120>)
0xab9f803a <+158>: lsls r4, r5, #17
End of assembler dump.
(gdb) x/wx 0xab9f8028
0xab9f8028 <_ZN8tcmalloc11ThreadCache22CreateCacheIfNecessaryEv+140>: 0x04677ca8
(gdb) x/wx (0x04677ca8+0xab9f8010+4)
0xb006fcbc: 0x00000008
The thread data offset (0x8) is different from the previous one (0x18).
I also checked the thread pointer in both instances are the same:
(gdb) del
(gdb) b *0xab9f8010
(gdb) c
(gdb) i r
r0 0xf67ef4c0 4135515328
(gdb) del
(gdb) b *0xab9f98e2
(gdb) c
(gdb) i r
r12 0xf67ef4c0 4135515328
I checked the relocation for this binary (chrome):
$ readelf --wide -r chrome | grep TLS
04e807b0 00000111 R_ARM_TLS_DTPMOD32 00000010 .LANCHOR1
04e79cbc 00000213 R_ARM_TLS_TPOFF32 00000000 <null>
04e79cd4 0000ae13 R_ARM_TLS_TPOFF32 00000010 _ZN8tcmalloc11ThreadCache17threadlocal_heap_E
The second entry is strange as its name is <null>.
The second and third entry addresses also match the two instances above (0xb006fcd4 and 0xb006fcbc). So it seems this variable is given two relocation entries?
Strangely when I build chrome locally (using the simplechrome flow), the issue does not happen:
readelf --wide -r chrome | grep TLS
0658de14 00000111 R_ARM_TLS_DTPMOD32 062bbe58 .tdata
06587548 0000ac13 R_ARM_TLS_TPOFF32 00000010 _ZN8tcmalloc11ThreadCache17threadlocal_heap_E
,
Sep 26 2016
[Please clarify if I miss any details] Chih-Chung ran into a weird performance issue where the perf profiler indicates that additional CPU cycles are spent in the TCMalloc thread cache related routines. However, with Chrome built from the simplechrome flow the problem does not exist. Does anyone know the difference of the build process between the Chrome build in chroot vs simplechrome flow? Also, what's the reason to have the simplechrome flow?
,
Sep 26 2016
the main difference I know is that the simplechrome workflow is not optimized as aggresively as ebuild workflow. The ebuild workflow uses AFDO (ie: chrome is optimized using a previously captured profile). The simplechrome workflow was created for developers modifying Chrome for ChromeOS. These developers did not want to have to deal with portage/ebuilds etc. The simplechrome workflow is simpler for them. You can just build chrome and deploy it into an existing chromeos image. I have not fully looked at the details in this bug, but according to the wording, this does not seem like a "performance" issue and seems to point to a correctness issue? What is it? ALso, maybe the difference in theread offsets within the instruction is due to inlining differences? AFDO greatly affect inlining decisions.
,
Sep 26 2016
Yes, I think this is a correctness issue, but it happens to just cause performance problem instead of a crash, for example.
,
Sep 26 2016
There are a handful of configuration differences between the ebuild and SimpleChrome. Most of the modifications occur here: https://cs.chromium.org/chromium/src/third_party/chromite/cli/cros/cros_chrome_sdk.py?l=704 I would not recommend using SimpleChrome for debugging something like this, there is a reason why we use a chroot for building Chrome OS.
,
Sep 26 2016
I think this is related to https://bugs.chromium.org/p/chromium/issues/detail?id=633719 Please take a look. Does what you are seeing disappear if you remove attribute tls_model("initial-exec") on threadlocal_heap_ variable? There is a pending request for disable AFDO for all of tcmalloc. I will ping dpranke about this.
,
Sep 27 2016
On Tue, Sep 27, 2016 at 4:32 AM, llozano via monorail <
,
Sep 27 2016
re #6: I haven't tried to remove it, but I did try to change it to 'local-exec': https://codereview.chromium.org/2363283002/ And in that case, the issue did disappear in my test, but I don't know if I am just lucky. I would prefer not losing performance if we can actually fix AFDO or see if there is anything else we can do (maybe try 'local-exec'?)
,
Sep 27 2016
ok, dpranke provided a way to disable AFDO for tcmalloc. I will give it a try. https://codereview.chromium.org/2375783002/ .
,
Dec 26 2017
,
Jan 2 2018
[It appears that a bunch of old cros issues bulk-added the "Infra" component recently, but they should probably be "Infra>Client>ChromeOS".]
,
Jan 4 2018
,
Feb 5 2018
|
|||||||
►
Sign in to add a comment |
|||||||
Comment 1 by chihchung@chromium.org
, Sep 26 2016