add general fdpic support in dynamic linker and arch support for sh

at this point not all functionality is complete. the dynamic linker
itself, and main app if it is also loaded by the kernel, take
advantage of fdpic and do not need constant displacement between
segments, but additional libraries loaded by the dynamic linker follow
normal ELF semantics for mapping still. this fully works, but does not
admit shared text on nommu.

in terms of actual functional correctness, dlsym's results are
presently incorrect for function symbols, RTLD_NEXT fails to identify
the caller correctly, and dladdr fails almost entirely.

with the dynamic linker entry point working, support for static pie is
automatically included, but linking the main application as ET_DYN
(pie) probably does not make sense for fdpic anyway. ET_EXEC is
equally relocatable but more efficient at representing relocations.
diff --git a/src/ldso/dynlink.c b/src/ldso/dynlink.c
index 4b52a5a..4903dbd 100644
--- a/src/ldso/dynlink.c
+++ b/src/ldso/dynlink.c
@@ -42,7 +42,11 @@
 };
 
 struct dso {
+#if DL_FDPIC
+	struct fdpic_loadmap *loadmap;
+#else
 	unsigned char *base;
+#endif
 	char *name;
 	size_t *dynv;
 	struct dso *next, *prev;
@@ -75,6 +79,16 @@
 	struct td_index *td_index;
 	struct dso *fini_next;
 	char *shortname;
+#if DL_FDPIC
+	unsigned char *base;
+#else
+	struct fdpic_loadmap *loadmap;
+#endif
+	struct funcdesc {
+		void *addr;
+		size_t *got;
+	} *funcdescs;
+	size_t *got;
 	char buf[];
 };
 
@@ -112,6 +126,8 @@
 static size_t tls_cnt, tls_offset, tls_align = MIN_TLS_ALIGN;
 static size_t static_tls_cnt;
 static pthread_mutex_t init_fini_lock = { ._m_type = PTHREAD_MUTEX_RECURSIVE };
+static struct fdpic_loadmap *app_loadmap;
+static struct fdpic_dummy_loadmap app_dummy_loadmap;
 
 struct debug *_dl_debug_addr = &debug;
 
@@ -123,7 +139,20 @@
 #define strcmp(l,r) dl_strcmp(l,r)
 
 /* Compute load address for a virtual address in a given dso. */
+#if DL_FDPIC /* test the value, not definedness: DL_FDPIC is also used in plain if (DL_FDPIC), so it exists even when 0 */
+static void *laddr(const struct dso *p, size_t v)
+{
+	size_t j=0;
+	if (!p->loadmap) return p->base + v; /* no loadmap: a single constant displacement applies */
+	for (j=0; v-p->loadmap->segs[j].p_vaddr >= p->loadmap->segs[j].p_memsz; j++); /* unsigned wrap also rejects v below p_vaddr; no nsegs bound, so v must lie in some segment */
+	return (void *)(v - p->loadmap->segs[j].p_vaddr + p->loadmap->segs[j].addr); /* offset within segment j, rebased to where it is mapped */
+}
+#define fpaddr(p, v) ((void (*)())&(struct funcdesc){ \
+	laddr(p, v), (p)->got })
+#else /* non-FDPIC: a function pointer is just the load address of the code */
 #define laddr(p, v) (void *)((p)->base + (v))
+#define fpaddr(p, v) ((void (*)())laddr(p, v))
+#endif
 
 static void decode_vec(size_t *v, size_t *a, size_t cnt)
 {
@@ -284,7 +313,7 @@
 	}
 
 	for (; rel_size; rel+=stride, rel_size-=stride*sizeof(size_t)) {
-		if (skip_relative && IS_RELATIVE(rel[1])) continue;
+		if (skip_relative && IS_RELATIVE(rel[1], dso->syms)) continue;
 		type = R_TYPE(rel[1]);
 		if (type == REL_NONE) continue;
 		sym_index = R_SYM(rel[1]);
@@ -293,7 +322,9 @@
 			sym = syms + sym_index;
 			name = strings + sym->st_name;
 			ctx = type==REL_COPY ? head->next : head;
-			def = find_sym(ctx, name, type==REL_PLT);
+			def = (sym->st_info&0xf) == STT_SECTION
+				? (struct symdef){ .dso = dso, .sym = sym }
+				: find_sym(ctx, name, type==REL_PLT);
 			if (!def.sym && (sym->st_shndx != SHN_UNDEF
 			    || sym->st_info>>4 != STB_WEAK)) {
 				error("Error relocating %s: %s: symbol not found",
@@ -349,6 +380,15 @@
 			*(uint32_t *)reloc_addr = sym_val + addend
 				- (size_t)reloc_addr;
 			break;
+		case REL_FUNCDESC:
+			*reloc_addr = def.sym ? (size_t)(def.dso->funcdescs
+				+ (def.sym - def.dso->syms)) : 0;
+			break;
+		case REL_FUNCDESC_VAL:
+			if ((sym->st_info&0xf) == STT_SECTION) *reloc_addr += sym_val;
+			else *reloc_addr = sym_val;
+			reloc_addr[1] = def.sym ? (size_t)def.dso->got : 0;
+			break;
 		case REL_DTPMOD:
 			*reloc_addr = def.dso->tls_id;
 			break;
@@ -430,6 +470,7 @@
 	Phdr *ph = dso->phdr;
 	size_t phcnt = dso->phnum;
 
+	if (DL_FDPIC) return; // FIXME
 	for (; phcnt--; ph=(void *)((char *)ph+dso->phentsize)) {
 		if (ph->p_type!=PT_LOAD) continue;
 		if ((ph->p_flags&(PF_R|PF_W))!=(PF_R|PF_W)) continue;
@@ -698,6 +739,8 @@
 		p->rpath_orig = p->strings + dyn[DT_RPATH];
 	if (dyn[0]&(1<<DT_RUNPATH))
 		p->rpath_orig = p->strings + dyn[DT_RUNPATH];
+	if (dyn[0]&(1<<DT_PLTGOT))
+		p->got = laddr(p, dyn[DT_PLTGOT]);
 	if (search_vec(p->dynv, dyn, DT_GNU_HASH))
 		p->ghashtab = laddr(p, *dyn);
 	if (search_vec(p->dynv, dyn, DT_VERSYM))
@@ -723,6 +766,46 @@
 	return nsym;
 }
 
+/* Map n bytes of private anonymous read/write memory via a direct syscall (no malloc); returns 0 on failure. */
+static void *dl_mmap(size_t n)
+{
+	int prot = PROT_READ|PROT_WRITE, flags = MAP_ANONYMOUS|MAP_PRIVATE;
+#ifdef SYS_mmap2
+	void *p = (void *)__syscall(SYS_mmap2, 0, n, prot, flags, -1, 0);
+#else
+	void *p = (void *)__syscall(SYS_mmap, 0, n, prot, flags, -1, 0);
+#endif
+	return (unsigned long)p > -4096UL ? 0 : p; /* a raw syscall reports failure as -errno (top 4095 values), not as MAP_FAILED */
+}
+
+static void makefuncdescs(struct dso *p)
+{
+	static int self_done;
+	const size_t cnt = count_syms(p);
+	struct funcdesc *fd;
+	size_t k;
+
+	/* Only the very first table comes from dl_mmap; all later ones use malloc. */
+	fd = self_done ? malloc(cnt * sizeof *fd) : dl_mmap(cnt * sizeof *fd);
+	self_done = 1;
+	p->funcdescs = fd;
+	if (!fd) {
+		/* When not yet in runtime mode, crash instead of reporting via rtld_fail. */
+		if (!runtime) a_crash();
+		error("Error allocating function descriptors for %s", p->name);
+		longjmp(*rtld_fail, 1);
+	}
+
+	/* One slot per symbol index; only defined STT_FUNC symbols get a
+	 * real descriptor, every other slot is zeroed. */
+	for (k = 0; k < cnt; k++, fd++) {
+		int is_def_func = (p->syms[k].st_info & 0xf) == STT_FUNC
+			&& p->syms[k].st_shndx;
+		fd->addr = is_def_func ? laddr(p, p->syms[k].st_value) : 0;
+		fd->got = is_def_func ? p->got : 0;
+	}
+}
+
 static struct dso *load_library(const char *name, struct dso *needed_by)
 {
 	char buf[2*NAME_MAX+2];
@@ -902,6 +985,8 @@
 	p->prev = tail;
 	tail = p;
 
+	if (DL_FDPIC) makefuncdescs(p);
+
 	if (ldd_mode) dprintf(1, "\t%s => %s (%p)\n", name, pathname, p->base);
 
 	return p;
@@ -1034,7 +1119,7 @@
 		}
 #ifndef NO_LEGACY_INITFINI
 		if ((dyn[0] & (1<<DT_FINI)) && dyn[DT_FINI])
-			((void (*)(void))laddr(p, dyn[DT_FINI]))();
+			fpaddr(p, dyn[DT_FINI])();
 #endif
 	}
 }
@@ -1057,7 +1142,7 @@
 		}
 #ifndef NO_LEGACY_INITFINI
 		if ((dyn[0] & (1<<DT_INIT)) && dyn[DT_INIT])
-			((void (*)(void))laddr(p, dyn[DT_INIT]))();
+			fpaddr(p, dyn[DT_INIT])();
 #endif
 		if (dyn[0] & (1<<DT_INIT_ARRAY)) {
 			size_t n = dyn[DT_INIT_ARRAYSZ]/sizeof(size_t);
@@ -1196,16 +1281,33 @@
 
 void __dls2(unsigned char *base, size_t *sp)
 {
-	Ehdr *ehdr = (void *)base;
-	ldso.base = base;
+	if (DL_FDPIC) {
+		void *p1 = (void *)sp[-2];
+		void *p2 = (void *)sp[-1];
+		if (!p1) {
+			size_t *auxv, aux[AUX_CNT];
+			for (auxv=sp+1+*sp+1; *auxv; auxv++); auxv++;
+			decode_vec(auxv, aux, AUX_CNT);
+			if (aux[AT_BASE]) ldso.base = (void *)aux[AT_BASE];
+			else ldso.base = (void *)(aux[AT_PHDR] & -4096);
+		}
+		app_loadmap = p2 ? p1 : 0;
+		ldso.loadmap = p2 ? p2 : p1;
+		ldso.base = laddr(&ldso, 0);
+	} else {
+		ldso.base = base;
+	}
+	Ehdr *ehdr = (void *)ldso.base;
 	ldso.name = ldso.shortname = "libc.so";
 	ldso.global = 1;
 	ldso.phnum = ehdr->e_phnum;
-	ldso.phdr = (void *)(base + ehdr->e_phoff);
+	ldso.phdr = laddr(&ldso, ehdr->e_phoff);
 	ldso.phentsize = ehdr->e_phentsize;
 	kernel_mapped_dso(&ldso);
 	decode_dyn(&ldso);
 
+	if (DL_FDPIC) makefuncdescs(&ldso);
+
 	/* Prepare storage for to save clobbered REL addends so they
 	 * can be reused in stage 3. There should be very few. If
 	 * something goes wrong and there are a huge number, abort
@@ -1217,7 +1319,7 @@
 	size_t symbolic_rel_cnt = 0;
 	apply_addends_to = rel;
 	for (; rel_size; rel+=2, rel_size-=2*sizeof(size_t))
-		if (!IS_RELATIVE(rel[1])) symbolic_rel_cnt++;
+		if (!IS_RELATIVE(rel[1], ldso.syms)) symbolic_rel_cnt++;
 	if (symbolic_rel_cnt >= ADDEND_LIMIT) a_crash();
 	size_t addends[symbolic_rel_cnt+1];
 	saved_addends = addends;
@@ -1231,7 +1333,8 @@
 	 * symbolically as a barrier against moving the address
 	 * load across the above relocation processing. */
 	struct symdef dls3_def = find_sym(&ldso, "__dls3", 0);
-	((stage3_func)laddr(&ldso, dls3_def.sym->st_value))(sp);
+	if (DL_FDPIC) ((stage3_func)&ldso.funcdescs[dls3_def.sym-ldso.syms])(sp);
+	else ((stage3_func)laddr(&ldso, dls3_def.sym->st_value))(sp);
 }
 
 /* Stage 3 of the dynamic linker is called with the dynamic linker/libc
@@ -1298,6 +1401,7 @@
 				app.tls_align = phdr->p_align;
 			}
 		}
+		if (DL_FDPIC) app.loadmap = app_loadmap;
 		if (app.tls_size) app.tls_image = laddr(&app, tls_image);
 		if (interp_off) ldso.name = laddr(&app, interp_off);
 		if ((aux[0] & (1UL<<AT_EXECFN))
@@ -1384,6 +1488,16 @@
 	}
 	app.global = 1;
 	decode_dyn(&app);
+	if (DL_FDPIC) {
+		makefuncdescs(&app);
+		if (!app.loadmap) {
+			app.loadmap = (void *)&app_dummy_loadmap;
+			app.loadmap->nsegs = 1;
+			app.loadmap->segs[0].addr = (size_t)app.base;
+			app.loadmap->segs[0].p_memsz = -1;
+		}
+		argv[-3] = (void *)app.loadmap;
+	}
 
 	/* Attach to vdso, if provided by the kernel */
 	if (search_vec(auxv, &vdso_base, AT_SYSINFO_EHDR)) {
@@ -1512,6 +1626,8 @@
 				free(p->td_index);
 				p->td_index = tmp;
 			}
+			if (p->funcdescs)
+				free(p->funcdescs);
 			if (p->rpath != p->rpath_orig)
 				free(p->rpath);
 			free(p->deps);