Subzero: Add Non-SFI support for x86-32.

The basic model is that each translated function begins with a special "GotVar = getIP" instruction, and each ConstantRelocatable reference is changed to GotVar+ConstantRelocatable@GOTOFF (assuming GotVar is legalized into a physical register).  The getIP instruction is late-lowered into:
  call __Sz_getIP_<reg>
  add <reg>, $_GLOBAL_OFFSET_TABLE_
  mov GotVar, <reg>
Note that _GLOBAL_OFFSET_TABLE_ gets a special relocation type.

The register allocator takes GotVar uses into account, giving appropriate weight toward register allocation.

If there are no uses of GotVar, the getIP instruction gets naturally dead-code eliminated.  Special treatment is needed to prevent this elimination when the only GotVar uses are for (floating point) constant pool values from Phi instructions, since the Phi lowering with its GotVar legalization happens after the main round of register allocation.

The x86 mem operand now has an IsPIC field to indicate whether it has been PIC-legalized.  Mem operands are sometimes legalized more than once, and this IsPIC field keeps GotVar from being added more than once.

We have to limit the aggressiveness of address mode inference, to make sure a register slot is left for the GotVar.

The Subzero runtime has new asm files to implement all possible __Sz_getIP_<reg> helpers.

The szbuild.py script and the spec2k version support Non-SFI builds.  Running spec2k depends on a pending change to the spec2k run_all.sh script.

Read-only data sections need to be named .data.rel.ro instead of .rodata because of PIC rules.

Most cross tests are working, but there is some problem with vector types that appears to be unrelated to Subzero, so most vector tests are disabled for now.

Still to do:

* Fix "--nonsfi --filetype=iasm".  The llvm-mc assembler doesn't properly apply the _GLOBAL_OFFSET_TABLE_ relocation in iasm mode.  Maybe I can find a different syntactic trick that works, or use hybrid iasm for this limited case.

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4327
R=jpp@chromium.org

Review URL: https://codereview.chromium.org/1506653002 .
diff --git a/Makefile.standalone b/Makefile.standalone
index 89e8d84..d84e928 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -367,10 +367,15 @@
 $(OBJDIR)/unittest/AssemblerX8664: $(OBJDIR)/unittest
 	@mkdir -p $@
 
-RT_SRC := runtime/szrt.c runtime/szrt_ll.ll runtime/szrt_profiler.c
+RT_SRC := runtime/szrt.c runtime/szrt_ll.ll runtime/szrt_profiler.c \
+          runtime/szrt_asm_x8632.s runtime/szrt_asm_x8664.s \
+          runtime/szrt_asm_arm32.s
 RT_OBJ := build/runtime/szrt_native_x8632.o build/runtime/szrt_sb_x8632.o \
+          build/runtime/szrt_nonsfi_x8632.o \
           build/runtime/szrt_native_x8664.o build/runtime/szrt_sb_x8664.o \
-          build/runtime/szrt_native_arm32.o build/runtime/szrt_sb_arm32.o
+          build/runtime/szrt_nonsfi_x8664.o \
+          build/runtime/szrt_native_arm32.o build/runtime/szrt_sb_arm32.o \
+          build/runtime/szrt_nonsfi_arm32.o
 
 runtime: $(RT_OBJ)
 
@@ -400,10 +405,12 @@
           -i x8632,native,sse2 \
           -i x8632,native,sse4.1,test_vector_ops \
           -i x8632,sandbox,sse4.1,Om1 \
+          -i x8632,nonsfi,sse2,O2 -e x8632,nonsfi,test_select \
           -i x8664,native,sse2 \
           -i x8664,native,sse4.1,test_vector_ops \
           -e x8664,sandbox,sse4.1,Om1 \
           -i arm32,neon \
+          -e arm32,nonsfi \
           -e arm32,neon,test_vector_ops \
           -e arm32,neon,test_select
 	PNACL_BIN_PATH=$(PNACL_BIN_PATH) \
diff --git a/crosstest/crosstest.cfg b/crosstest/crosstest.cfg
index c92963c..afc75b4 100644
--- a/crosstest/crosstest.cfg
+++ b/crosstest/crosstest.cfg
@@ -52,9 +52,6 @@
 
 [test_sync_atomic]
 driver: test_sync_atomic_main.cpp
-# Compile the non-Subzero object files straight from source since the native
-# LLVM backend does not understand how to lower NaCl-specific intrinsics.
-flags: --crosstest-bitcode=0
 test: test_sync_atomic.cpp
 
 [test_vector_ops]
diff --git a/crosstest/test_arith_main.cpp b/crosstest/test_arith_main.cpp
index 8e0d6c0..db03709 100644
--- a/crosstest/test_arith_main.cpp
+++ b/crosstest/test_arith_main.cpp
@@ -162,7 +162,7 @@
 
 template <typename TypeUnsignedLabel, typename TypeSignedLabel>
 void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
-#ifndef ARM32
+#if !defined(ARM32) && !defined(NONSFI)
   // TODO(jpp): remove this once vector support is implemented.
   typedef typename Vectors<TypeUnsignedLabel>::Ty TypeUnsigned;
   typedef typename Vectors<TypeSignedLabel>::Ty TypeSigned;
@@ -204,7 +204,7 @@
     for (size_t i = 0; i < MaxTestsPerFunc; ++i) {
       // Initialize the test vectors.
       TypeUnsigned Value1, Value2;
-      for (size_t j = 0; j < NumElementsInType;) {
+      for (size_t j = 0; j < NumElementsInType; ++j) {
         ElementTypeUnsigned Element1 = Values[Index() % NumValues];
         ElementTypeUnsigned Element2 = Values[Index() % NumValues];
         if (Funcs[f].ExcludeDivExceptions &&
@@ -214,7 +214,6 @@
           Element2 &= CHAR_BIT * sizeof(ElementTypeUnsigned) - 1;
         Value1[j] = Element1;
         Value2[j] = Element2;
-        ++j;
       }
       // Perform the test.
       TypeUnsigned ResultSz, ResultLlc;
@@ -240,7 +239,7 @@
       }
     }
   }
-#endif // ARM32
+#endif // !ARM32 && !NONSFI
 }
 
 template <typename Type>
@@ -316,7 +315,7 @@
 }
 
 void testsVecFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
-#ifndef ARM32
+#if !defined(ARM32) && !defined(NONSFI)
   // TODO(jpp): remove this once vector support is implemented.
   static const float NegInf = -1.0 / 0.0;
   static const float PosInf = 1.0 / 0.0;
@@ -376,7 +375,7 @@
       }
     }
   }
-#endif // ARM32
+#endif // !ARM32 && !NONSFI
 }
 
 #ifdef X8664_STACK_HACK
diff --git a/crosstest/test_global_main.cpp b/crosstest/test_global_main.cpp
index 5383cb7..6b4eec7 100644
--- a/crosstest/test_global_main.cpp
+++ b/crosstest/test_global_main.cpp
@@ -39,6 +39,11 @@
 double ExternName5 = 3.44e26;
 
 int main(int argc, char **argv) {
+  // Prevent pnacl-opt from deleting "unused" globals.
+  if (argc < 0) {
+    std::cout << &ExternName1 << &ExternName2 << &ExternName3 << &SimpleData
+              << &ExternName4 << ExternName5;
+  }
   size_t TotalTests = 0;
   size_t Passes = 0;
   size_t Failures = 0;
diff --git a/crosstest/test_icmp_main.cpp b/crosstest/test_icmp_main.cpp
index 401520d..4378a6d 100644
--- a/crosstest/test_icmp_main.cpp
+++ b/crosstest/test_icmp_main.cpp
@@ -197,7 +197,7 @@
 
 template <typename TypeUnsignedLabel, typename TypeSignedLabel>
 void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
-#ifndef ARM32
+#if !defined(ARM32) && !defined(NONSFI)
   // TODO(jpp): remove this once vector support is implemented.
   typedef typename Vectors<TypeUnsignedLabel>::Ty TypeUnsigned;
   typedef typename Vectors<TypeSignedLabel>::Ty TypeSigned;
@@ -255,7 +255,7 @@
       }
     }
   }
-#endif // ARM32
+#endif // !ARM32 && !NONSFI
 }
 
 // Return true on wraparound
@@ -274,7 +274,7 @@
 
 template <typename T>
 void testsVecI1(size_t &TotalTests, size_t &Passes, size_t &Failures) {
-#ifndef ARM32
+#if !defined(ARM32) && !defined(NONSFI)
   // TODO(jpp): remove this once vector support is implemented.
   typedef typename Vectors<T>::Ty Ty;
   typedef Ty (*FuncType)(Ty, Ty);
@@ -343,7 +343,7 @@
       }
     }
   }
-#endif // ARM32
+#endif // !ARM32 && !NONSFI
 }
 
 #ifdef X8664_STACK_HACK
diff --git a/pydir/build-runtime.py b/pydir/build-runtime.py
index ad38a2e..c735e85 100755
--- a/pydir/build-runtime.py
+++ b/pydir/build-runtime.py
@@ -11,63 +11,71 @@
 
 
 def Translate(ll_files, extra_args, obj, verbose):
-    """Translate a set of input bitcode files into a single object file.
+  """Translate a set of input bitcode files into a single object file.
 
-    Use pnacl-llc to translate textual bitcode input ll_files into object file
-    obj, using extra_args as the architectural flags.
-    """
-    shellcmd(['cat'] + ll_files + ['|',
-              'pnacl-llc',
-              '-externalize',
-              '-function-sections',
-              '-O2',
-              '-filetype=obj',
-              '-bitcode-format=llvm',
-              '-o', obj
-          ] + extra_args, echo=verbose)
-    shellcmd(['le32-nacl-objcopy',
-              '--strip-symbol=nacl_tp_tdb_offset',
-              '--strip-symbol=nacl_tp_tls_offset',
-              obj
-        ], echo=verbose)
+  Use pnacl-llc to translate textual bitcode input ll_files into object file
+  obj, using extra_args as the architectural flags.
+  """
+  shellcmd(['cat'] + ll_files + ['|',
+            'pnacl-llc',
+            '-externalize',
+            '-function-sections',
+            '-O2',
+            '-filetype=obj',
+            '-bitcode-format=llvm',
+            '-o', obj
+    ] + extra_args, echo=verbose)
+  shellcmd(['le32-nacl-objcopy',
+            '--strip-symbol=nacl_tp_tdb_offset',
+            '--strip-symbol=nacl_tp_tls_offset',
+            obj
+    ], echo=verbose)
 
 
 def PartialLink(obj_files, extra_args, lib, verbose):
-    """Partially links a set of obj files into a final obj library."""
-    shellcmd(['le32-nacl-ld',
-              '-o', lib,
-              '-r',
-        ] + extra_args + obj_files, echo=verbose)
+  """Partially links a set of obj files into a final obj library."""
+  shellcmd(['le32-nacl-ld',
+            '-o', lib,
+            '-r',
+    ] + extra_args + obj_files, echo=verbose)
 
 
 def MakeRuntimesForTarget(target_info, ll_files,
                           srcdir, tempdir, rtdir, verbose):
-    def TmpFile(template):
-        return template.format(dir=tempdir, target=target_info.target)
-    def OutFile(template):
-        return template.format(rtdir=rtdir, target=target_info.target)
+  """Builds native, sandboxed, and nonsfi runtimes for the given target."""
+  # File-mangling helper functions.
+  def TmpFile(template):
+    return template.format(dir=tempdir, target=target_info.target)
+  def OutFile(template):
+    return template.format(rtdir=rtdir, target=target_info.target)
+  # Helper function for building the native unsandboxed runtime.
+  def MakeNativeRuntime():
+    """Builds just the native runtime."""
     # Translate tempdir/szrt.ll and tempdir/szrt_ll.ll to
     # szrt_native_{target}.tmp.o.
     Translate(ll_files,
               ['-mtriple=' + target_info.triple] + target_info.llc_flags,
               TmpFile('{dir}/szrt_native_{target}.tmp.o'),
               verbose)
-    # Compile srcdir/szrt_profiler.c to tempdir/szrt_profiler_native_{target}.o
+    # Compile srcdir/szrt_profiler.c to
+    # tempdir/szrt_profiler_native_{target}.o.
     shellcmd(['clang',
               '-O2',
               '-target=' + target_info.triple,
               '-c',
               '{srcdir}/szrt_profiler.c'.format(srcdir=srcdir),
               '-o', TmpFile('{dir}/szrt_profiler_native_{target}.o')
-    ], echo=verbose)
-    # Writing full szrt_native_{target}.o.
+      ], echo=verbose)
+    # Write full szrt_native_{target}.o.
     PartialLink([TmpFile('{dir}/szrt_native_{target}.tmp.o'),
                  TmpFile('{dir}/szrt_profiler_native_{target}.o')],
                 ['-m {ld_emu}'.format(ld_emu=target_info.ld_emu)],
                 OutFile('{rtdir}/szrt_native_{target}.o'),
                 verbose)
-
-    # Translate tempdir/szrt.ll and tempdir/szrt_ll.ll to szrt_sb_{target}.o
+  # Helper function for building the sandboxed runtime.
+  def MakeSandboxedRuntime():
+    """Builds just the sandboxed runtime."""
+    # Translate tempdir/szrt.ll and tempdir/szrt_ll.ll to szrt_sb_{target}.o.
     # The sandboxed library does not get the profiler helper function as the
     # binaries are linked with -nostdlib.
     Translate(ll_files,
@@ -75,11 +83,41 @@
               target_info.llc_flags,
               OutFile('{rtdir}/szrt_sb_{target}.o'),
               verbose)
+  # Helper function for building the Non-SFI runtime.
+  def MakeNonsfiRuntime():
+    """Builds just the nonsfi runtime."""
+    # Translate tempdir/szrt.ll and tempdir/szrt_ll.ll to
+    # szrt_nonsfi_{target}.tmp.o.
+    Translate(ll_files,
+              ['-mtriple=' + target_info.triple] + target_info.llc_flags +
+              ['-relocation-model=pic', '-force-tls-non-pic', '-malign-double'],
+              TmpFile('{dir}/szrt_nonsfi_{target}.tmp.o'),
+              verbose)
+    # Assemble srcdir/szrt_asm_{target}.s to tempdir/szrt_asm_{target}.o.
+    shellcmd(['llvm-mc',
+              '-triple=' + target_info.triple,
+              '-filetype=obj',
+              '-o', TmpFile('{dir}/szrt_asm_{target}.o'),
+              '{srcdir}/szrt_asm_{target}.s'.format(
+                srcdir=srcdir, target=target_info.target)
+      ], echo=verbose)
+    # Write full szrt_nonsfi_{target}.o.
+    PartialLink([TmpFile('{dir}/szrt_nonsfi_{target}.tmp.o'),
+                 TmpFile('{dir}/szrt_asm_{target}.o')],
+                ['-m {ld_emu}'.format(ld_emu=target_info.ld_emu)],
+                OutFile('{rtdir}/szrt_nonsfi_{target}.o'),
+                verbose)
+
+  # Run the helper functions.
+  MakeNativeRuntime()
+  MakeSandboxedRuntime()
+  MakeNonsfiRuntime()
 
 
 def main():
     """Build the Subzero runtime support library for all architectures.
     """
+    nacl_root = FindBaseNaCl()
     argparser = argparse.ArgumentParser(
         description='    ' + main.__doc__,
         formatter_class=argparse.RawTextHelpFormatter)
@@ -87,9 +125,11 @@
                            action='store_true',
                            help='Display some extra debugging output')
     argparser.add_argument('--pnacl-root', dest='pnacl_root',
+                           default=(
+                             '{root}/toolchain/linux_x86/pnacl_newlib_raw'
+                           ).format(root=nacl_root),
                            help='Path to PNaCl toolchain binaries.')
     args = argparser.parse_args()
-    nacl_root = FindBaseNaCl()
     os.environ['PATH'] = ('{root}/bin{sep}{path}'
         ).format(root=args.pnacl_root, sep=os.pathsep, path=os.environ['PATH'])
     srcdir = (
diff --git a/pydir/crosstest.py b/pydir/crosstest.py
index bb88f2d..50946ca 100755
--- a/pydir/crosstest.py
+++ b/pydir/crosstest.py
@@ -7,8 +7,8 @@
 import tempfile
 
 import targets
-from utils import shellcmd
-from utils import FindBaseNaCl
+from szbuild import LinkNonsfi
+from utils import shellcmd, FindBaseNaCl, get_sfi_string
 
 def main():
     """Builds a cross-test binary for comparing Subzero and llc translation.
@@ -74,6 +74,9 @@
     argparser.add_argument('--sandbox', required=False, default=0, type=int,
                            dest='sandbox',
                            help='Use sandboxing. Default "%(default)s".')
+    argparser.add_argument('--nonsfi', required=False, default=0, type=int,
+                           dest='nonsfi',
+                           help='Use Non-SFI mode. Default "%(default)s".')
     argparser.add_argument('--prefix', required=True,
                            metavar='SZ_PREFIX',
                            help='String prepended to Subzero symbol names')
@@ -84,12 +87,6 @@
                            metavar='OUTPUT_DIR',
                            help='Output directory for all files.' +
                                 ' Default "%(default)s".')
-    argparser.add_argument('--crosstest-bitcode', required=False,
-                           default=1, type=int,
-                           help='Compile non-subzero crosstest object file ' +
-                           'from the same bitcode as the subzero object. ' +
-                           'If 0, then compile it straight from source.' +
-                           ' Default %(default)d.')
     argparser.add_argument('--filetype', default='obj', dest='filetype',
                            choices=['obj', 'asm', 'iasm'],
                            help='Output file type.  Default %(default)s.')
@@ -105,15 +102,21 @@
     if args.sandbox:
         triple = targets.ConvertTripleToNaCl(triple)
     llc_flags = target_info.llc_flags + arch_llc_flags_extra[args.target]
+    if args.nonsfi:
+        llc_flags.extend(['-relocation-model=pic',
+                          '-malign-double',
+                          '-force-tls-non-pic',
+                          '-mtls-use-call'])
     mypath = os.path.abspath(os.path.dirname(sys.argv[0]))
 
+    # Construct a "unique key" for each test so that tests can be run in
+    # parallel without race conditions on temporary file creation.
+    key = '{sb}.O{opt}.{attr}.{target}'.format(
+        target=args.target,
+        sb=get_sfi_string(args, 'sb', 'nonsfi', 'nat'),
+        opt=args.optlevel, attr=args.attr)
     objs = []
     for arg in args.test:
-        # Construct a "unique key" for each test so that tests can be run in
-        # parallel without race conditions on temporary file creation.
-        key = '{target}.{sb}.O{opt}.{attr}'.format(
-            target=args.target, sb='sb' if args.sandbox else 'nat',
-            opt=args.optlevel, attr=args.attr)
         base, ext = os.path.splitext(arg)
         if ext == '.ll':
             bitcode = arg
@@ -123,12 +126,16 @@
             bitcode = os.path.join(args.dir, base + '.' + key + '.pnacl.ll')
             shellcmd(['{bin}/pnacl-clang'.format(bin=bindir),
                       ('-O2' if args.clang_opt else '-O0'),
+                      get_sfi_string(args, '', '-DNONSFI', ''),
                       ('-DARM32' if args.target == 'arm32' else ''), '-c', arg,
                       '-o', bitcode_nonfinal])
             shellcmd(['{bin}/pnacl-opt'.format(bin=bindir),
                       '-pnacl-abi-simplify-preopt',
                       '-pnacl-abi-simplify-postopt',
                       '-pnaclabi-allow-debug-metadata',
+                      '-strip-metadata',
+                      '-strip-module-flags',
+                      '-strip-debug',
                       bitcode_nonfinal, '-S', '-o', bitcode])
 
         base_sz = '{base}.{key}'.format(base=base, key=key)
@@ -142,6 +149,7 @@
                   '-mattr=' + args.attr,
                   '--target=' + args.target,
                   '--sandbox=' + str(args.sandbox),
+                  '--nonsfi=' + str(args.nonsfi),
                   '--prefix=' + args.prefix,
                   '-allow-uninitialized-globals',
                   '-externalize',
@@ -164,25 +172,32 @@
         # linked into the executable, but when PNaCl supports shared nexe
         # libraries, this would need to change.
         shellcmd(['{bin}/le32-nacl-objcopy'.format(bin=bindir),
-                  '--weaken-symbol=__Sz_block_profile_info', obj_sz])
+                  '--weaken-symbol=__Sz_block_profile_info',
+                  '--strip-symbol=nacl_tp_tdb_offset',
+                  '--strip-symbol=nacl_tp_tls_offset',
+                  obj_sz])
         objs.append(obj_sz)
-        if args.crosstest_bitcode:
-            shellcmd(['{bin}/pnacl-llc'.format(bin=bindir),
-                      '-mtriple=' + triple,
-                      '-externalize',
-                      '-filetype=obj',
-                      '-o=' + obj_llc,
-                      bitcode] + llc_flags)
-            objs.append(obj_llc)
-        else:
-            objs.append(arg)
+        shellcmd(['{bin}/pnacl-llc'.format(bin=bindir),
+                  '-mtriple=' + triple,
+                  '-externalize',
+                  '-filetype=obj',
+                  '-bitcode-format=llvm',
+                  '-o=' + obj_llc,
+                  bitcode] + llc_flags)
+        shellcmd(['{bin}/le32-nacl-objcopy'.format(bin=bindir),
+                  '--weaken-symbol=__Sz_block_profile_info',
+                  '--strip-symbol=nacl_tp_tdb_offset',
+                  '--strip-symbol=nacl_tp_tls_offset',
+                  obj_llc])
+        objs.append(obj_llc)
 
     # Add szrt_sb_${target}.o or szrt_native_${target}.o.
-    objs.append((
-            '{root}/toolchain_build/src/subzero/build/runtime/' +
-            'szrt_{sb}_' + args.target + '.o'
-            ).format(root=nacl_root, sb='sb' if args.sandbox else 'native'))
-    pure_c = os.path.splitext(args.driver)[1] == '.c'
+    if not args.nonsfi:
+        objs.append((
+                '{root}/toolchain_build/src/subzero/build/runtime/' +
+                'szrt_{sb}_' + args.target + '.o'
+                ).format(root=nacl_root,
+                         sb=get_sfi_string(args, 'sb', 'nonsfi', 'native')))
 
     # TODO(jpp): clean up stack hack related code.
     needs_stack_hack = False
@@ -196,21 +211,71 @@
     if args.target == 'arm32':
       target_params.append('-DARM32')
       target_params.append('-static')
+    if args.nonsfi:
+      target_params.append('-DNONSFI')
 
-    # Set compiler to clang, clang++, pnacl-clang, or pnacl-clang++.
+    pure_c = os.path.splitext(args.driver)[1] == '.c'
+    if not args.nonsfi:
+        # Set compiler to clang, clang++, pnacl-clang, or pnacl-clang++.
+        compiler = '{bin}/{prefix}{cc}'.format(
+            bin=bindir, prefix=get_sfi_string(args, 'pnacl-', '', ''),
+            cc='clang' if pure_c else 'clang++')
+        sb_native_args = (['-O0', '--pnacl-allow-native',
+                           '-arch', target_info.compiler_arch,
+                           '-Wn,-defsym=__Sz_AbsoluteZero=0']
+                          if args.sandbox else
+                          ['-g', '-target=' + triple,
+                           '-lm', '-lpthread',
+                           '-Wl,--defsym=__Sz_AbsoluteZero=0'] +
+                          target_info.cross_headers)
+        shellcmd([compiler] + target_params + [args.driver] + objs +
+                 ['-o', os.path.join(args.dir, args.output)] + sb_native_args)
+        return 0
+
+    base, ext = os.path.splitext(args.driver)
+    bitcode_nonfinal = os.path.join(args.dir, base + '.' + key + '.bc')
+    bitcode = os.path.join(args.dir, base + '.' + key + '.pnacl.ll')
+    asm_sz = os.path.join(args.dir, base + '.' + key + '.s')
+    obj_llc = os.path.join(args.dir, base + '.' + key + '.o')
     compiler = '{bin}/{prefix}{cc}'.format(
-        bin=bindir, prefix='pnacl-' if args.sandbox else '',
+        bin=bindir, prefix='pnacl-',
         cc='clang' if pure_c else 'clang++')
-    sb_native_args = (['-O0', '--pnacl-allow-native',
-                       '-arch', target_info.compiler_arch,
-                       '-Wn,-defsym=__Sz_AbsoluteZero=0']
-                      if args.sandbox else
-                      ['-g', '-target=' + triple,
-                       '-lm', '-lpthread',
-                       '-Wl,--defsym=__Sz_AbsoluteZero=0'] +
-                      target_info.cross_headers)
-    shellcmd([compiler] + target_params + [args.driver] + objs +
-             ['-o', os.path.join(args.dir, args.output)] + sb_native_args)
+    shellcmd([compiler,
+              args.driver,
+              '-DNONSFI' if args.nonsfi else '',
+              '-O2',
+              '-o', bitcode_nonfinal,
+              '-Wl,-r'
+             ])
+    shellcmd(['{bin}/pnacl-opt'.format(bin=bindir),
+              '-pnacl-abi-simplify-preopt',
+              '-pnacl-abi-simplify-postopt',
+              '-pnaclabi-allow-debug-metadata',
+              '-strip-metadata',
+              '-strip-module-flags',
+              '-strip-debug',
+              '-disable-opt',
+              bitcode_nonfinal, '-S', '-o', bitcode])
+    shellcmd(['{bin}/pnacl-llc'.format(bin=bindir),
+              '-mtriple=' + triple,
+              '-externalize',
+              '-filetype=obj',
+              '-O2',
+              '-bitcode-format=llvm',
+              '-o', obj_llc,
+              bitcode] + llc_flags)
+    if not args.sandbox and not args.nonsfi:
+        shellcmd(['{bin}/le32-nacl-objcopy'.format(bin=bindir),
+                  '--redefine-sym', '_start=_user_start',
+                  obj_llc
+                 ])
+    objs.append(obj_llc)
+    if args.nonsfi:
+        LinkNonsfi(objs, os.path.join(args.dir, args.output), args.target)
+    elif args.sandbox:
+        LinkSandbox(objs, os.path.join(args.dir, args.output), args.target)
+    else:
+        LinkNative(objs, os.path.join(args.dir, args.output), args.target)
 
 if __name__ == '__main__':
     main()
diff --git a/pydir/crosstest_generator.py b/pydir/crosstest_generator.py
index ef32a08..31a1cf7 100755
--- a/pydir/crosstest_generator.py
+++ b/pydir/crosstest_generator.py
@@ -56,7 +56,7 @@
 
   # The rest of the attribute sets.
   targets = [ 'x8632', 'x8664', 'arm32' ]
-  sandboxing = [ 'native', 'sandbox' ]
+  sandboxing = [ 'native', 'sandbox', 'nonsfi' ]
   opt_levels = [ 'Om1', 'O2' ]
   arch_attrs = { 'x8632': [ 'sse2', 'sse4.1' ],
                  'x8664': [ 'sse2', 'sse4.1' ],
@@ -103,7 +103,10 @@
   argparser.add_argument('--lit', default=False, action='store_true',
                          help='Generate files for lit testing')
   argparser.add_argument('--toolchain-root', dest='toolchain_root',
-                           help='Path to toolchain binaries.')
+                         default=(
+                           '{root}/toolchain/linux_x86/pnacl_newlib_raw/bin'
+                         ).format(root=root),
+                         help='Path to toolchain binaries.')
   args = argparser.parse_args()
 
   # Run from the crosstest directory to make it easy to grab inputs.
@@ -156,6 +159,7 @@
                  '--mattr={attr}'.format(attr=attr),
                  '--prefix=Subzero_',
                  '--target={target}'.format(target=target),
+                 '--nonsfi={nsfi}'.format(nsfi='1' if sb=='nonsfi' else '0'),
                  '--sandbox={sb}'.format(sb='1' if sb=='sandbox' else '0'),
                  '--dir={dir}'.format(dir=args.dir),
                  '--output={exe}'.format(exe=exe),
@@ -169,6 +173,9 @@
               run_cmd = run_cmd_base
               if sb == 'sandbox':
                 run_cmd = '{root}/run.py -q '.format(root=root) + run_cmd
+              elif sb == 'nonsfi':
+                run_cmd = ('{root}/scons-out/opt-linux-x86-32/obj/src/nonsfi/' +
+                           'loader/nonsfi_loader ').format(root=root) + run_cmd
               else:
                 run_cmd = RunNativePrefix(args.toolchain_root, target, run_cmd)
               if args.lit:
diff --git a/pydir/szbuild.py b/pydir/szbuild.py
index 73633a4..95a6a2c 100755
--- a/pydir/szbuild.py
+++ b/pydir/szbuild.py
@@ -6,8 +6,7 @@
 import re
 import sys
 
-from utils import shellcmd
-from utils import FindBaseNaCl
+from utils import shellcmd, FindBaseNaCl, get_sfi_string
 
 def NewerThanOrNotThere(old_path, new_path):
     """Returns whether old_path is newer than new_path.
@@ -85,6 +84,8 @@
                            help='Output file type.  Default %(default)s.')
     argparser.add_argument('--sandbox', dest='sandbox', action='store_true',
                            help='Enable sandboxing in the translator')
+    argparser.add_argument('--nonsfi', dest='nonsfi', action='store_true',
+                           help='Enable Non-SFI in the translator')
     argparser.add_argument('--enable-block-profile',
                            dest='enable_block_profile', action='store_true',
                            help='Enable basic block profiling.')
@@ -101,6 +102,116 @@
     argparser.add_argument('--no-sz', dest='nosz', action='store_true',
                            help='Run only post-Subzero build steps')
 
+def LinkSandbox(objs, exe, target, verbose=True):
+    assert target in ('x8632', 'arm32'), \
+        '-sandbox is not available for %s' % target
+    nacl_root = FindBaseNaCl()
+    gold = ('{root}/toolchain/linux_x86/pnacl_newlib_raw/bin/' +
+            'le32-nacl-ld.gold').format(root=nacl_root)
+    target_lib_dir = {
+      'arm32': 'arm',
+      'x8632': 'x86-32',
+    }[target]
+    linklib = ('{root}/toolchain/linux_x86/pnacl_newlib_raw/translator/' +
+               '{target_dir}/lib').format(root=nacl_root,
+                                          target_dir=target_lib_dir)
+    shellcmd([gold,
+              '-nostdlib',
+              '--no-fix-cortex-a8',
+              '--eh-frame-hdr',
+              '-z', 'text',
+              #'-z', 'noexecstack',
+              '--build-id',
+              '--entry=__pnacl_start',
+              '-static', #'-pie',
+              '{linklib}/crtbegin.o'.format(linklib=linklib)] +
+             objs +
+             [('{root}/toolchain_build/src/subzero/build/runtime/' +
+               'szrt_sb_{target}.o').format(root=nacl_root, target=target),
+              '{linklib}/libpnacl_irt_shim_dummy.a'.format(linklib=linklib),
+              '--start-group',
+              '{linklib}/libgcc.a'.format(linklib=linklib),
+              '{linklib}/libcrt_platform.a'.format(linklib=linklib),
+              '--end-group',
+              '{linklib}/crtend.o'.format(linklib=linklib),
+              '--undefined=_start',
+              '--defsym=__Sz_AbsoluteZero=0',
+              #'--defsym=_begin=0',
+              '-o', exe
+             ], echo=verbose)
+
+def LinkNonsfi(objs, exe, target, verbose=True):
+    nacl_root = FindBaseNaCl()
+    gold = ('{root}/toolchain/linux_x86/pnacl_newlib_raw/bin/' +
+            'le32-nacl-ld.gold').format(root=nacl_root)
+    target_lib_dir = {
+      'arm32': 'arm-nonsfi',
+      'x8632': 'x86-32-nonsfi',
+    }[target]
+    linklib = ('{root}/toolchain/linux_x86/pnacl_newlib_raw/translator/' +
+               '{target_dir}/lib').format(root=nacl_root,
+                                          target_dir=target_lib_dir)
+    shellcmd([gold,
+              '-nostdlib',
+              '--no-fix-cortex-a8',
+              '--eh-frame-hdr',
+              '-z', 'text',
+              '-z', 'noexecstack',
+              '--build-id',
+              '--entry=__pnacl_start',
+              '-pie',
+              '{linklib}/crtbegin.o'.format(linklib=linklib)] +
+             objs +
+             [('{root}/toolchain_build/src/subzero/build/runtime/' +
+               'szrt_nonsfi_{target}.o').format(root=nacl_root, target=target),
+              '{linklib}/libpnacl_irt_shim_dummy.a'.format(linklib=linklib),
+              '--start-group',
+              '{linklib}/libgcc.a'.format(linklib=linklib),
+              '{linklib}/libcrt_platform.a'.format(linklib=linklib),
+              '--end-group',
+              '{linklib}/crtend.o'.format(linklib=linklib),
+              '--undefined=_start',
+              '--defsym=__Sz_AbsoluteZero=0',
+              '--defsym=_begin=0',
+              '-o', exe
+             ], echo=verbose)
+
+def LinkNative(objs, exe, target, verbose=True):
+    nacl_root = FindBaseNaCl()
+    linker = {
+      'arm32': '/usr/bin/arm-linux-gnueabihf-g++',
+      'x8632': ('{root}/../third_party/llvm-build/Release+Asserts/bin/clang'
+               ).format(root=nacl_root),
+      'x8664': ('{root}/../third_party/llvm-build/Release+Asserts/bin/clang'
+               ).format(root=nacl_root)
+    }[target]
+
+    extra_linker_args = {
+      'arm32': ['-mcpu=cortex-a9'],
+      'x8632': ['-m32'],
+      'x8664': ['-mx32']
+    }[target]
+
+    lib_dir = {
+      'arm32': 'arm-linux',
+      'x8632': 'x86-32-linux',
+      'x8664': 'x86-64-linux',
+    }[target]
+
+    shellcmd([linker] +
+             extra_linker_args +
+             objs +
+             ['-o', exe,
+              ('{root}/toolchain/linux_x86/pnacl_newlib_raw/translator/' +
+               '{lib_dir}/lib/' +
+               '{{unsandboxed_irt,irt_random,irt_query_list}}.o').format(
+                   root=nacl_root, lib_dir=lib_dir),
+              ('{root}/toolchain_build/src/subzero/build/runtime/' +
+               'szrt_native_{target}.o').format(root=nacl_root, target=target),
+              '-lm', '-lpthread', '-lrt',
+              '-Wl,--defsym=__Sz_AbsoluteZero=0'
+             ], echo=verbose)
+
 def main():
     """Create a hybrid translation from Subzero and llc.
 
@@ -183,14 +294,15 @@
     opt_level = args.optlevel
     opt_level_map = { 'm1':'0', '-1':'0', '0':'0', '1':'1', '2':'2' }
     hybrid = args.include or args.exclude
+    native = not args.sandbox and not args.nonsfi
 
     if hybrid and (args.force or
                    NewerThanOrNotThere(pexe, obj_llc) or
                    NewerThanOrNotThere(llcbin, obj_llc)):
         arch = {
-          'arm32': 'armv7' if args.sandbox else 'arm-nonsfi',
-          'x8632': 'x86-32' if args.sandbox else 'x86-32-linux',
-          'x8664': 'x86-64' if args.sandbox else 'x86-64-linux',
+          'arm32': 'arm' + get_sfi_string(args, 'v7', '-nonsfi', '-nonsfi'),
+          'x8632': 'x86-32' + get_sfi_string(args, '', '-nonsfi', '-linux'),
+          'x8664': 'x86-64' + get_sfi_string(args, '', '', '-linux')
         }[args.target]
 
         # Only run pnacl-translate in hybrid mode.
@@ -207,7 +319,7 @@
                  args.llc_args +
                  [pexe],
                  echo=args.verbose)
-        if not args.sandbox:
+        if native:
             shellcmd((
                 '{objcopy} --redefine-sym _start=_user_start {obj}'
                 ).format(objcopy=objcopy, obj=obj_llc), echo=args.verbose)
@@ -231,6 +343,7 @@
                        '-ffunction-sections',
                        '-fdata-sections'] if hybrid else []) +
                      (['-sandbox'] if args.sandbox else []) +
+                     (['-nonsfi'] if args.nonsfi else []) +
                      (['-enable-block-profile'] if
                           args.enable_block_profile and not args.sandbox
                           else []) +
@@ -239,9 +352,11 @@
                      echo=args.verbose)
         if args.filetype != 'obj':
             triple = {
-              'arm32': 'arm-nacl' if args.sandbox else 'arm',
-              'x8632': 'i686-nacl' if args.sandbox else 'i686',
-              'x8664': 'x86_64-nacl' if args.sandbox else 'x86_64-linux-gnux32',
+              'arm32': 'arm' + get_sfi_string(args, '-nacl', '', ''),
+              'x8632': 'i686' + get_sfi_string(args, '-nacl', '', ''),
+              'x8664': 'x86_64' +
+                        get_sfi_string(args, '-nacl', '-linux-gnux32',
+                                       '-linux-gnux32'),
             }[args.target]
 
             shellcmd((
@@ -249,7 +364,7 @@
                 ).format(base=path_addition, asm=asm_sz, obj=obj_sz,
                          triple=triple),
                      echo=args.verbose)
-        if not args.sandbox:
+        if native:
             shellcmd((
                 '{objcopy} --redefine-sym _start=_user_start {obj}'
                 ).format(objcopy=objcopy, obj=obj_sz), echo=args.verbose)
@@ -317,69 +432,17 @@
             '{objcopy} --globalize-symbol={start} ' +
             '--globalize-symbol=__Sz_block_profile_info {partial}'
             ).format(objcopy=objcopy, partial=obj_partial,
-                     start='_start' if args.sandbox else '_user_start'),
+                     start=get_sfi_string(args, '_start', '_start',
+                                          '_user_start')),
                  echo=args.verbose)
 
     # Run the linker regardless of hybrid mode.
     if args.sandbox:
-        assert args.target in ('x8632', 'arm32'), \
-            '-sandbox is not available for %s' % args.target
-        target_lib_dir = {
-          'arm32': 'arm',
-          'x8632': 'x86-32',
-        }[args.target]
-        linklib = ('{root}/toolchain/linux_x86/pnacl_newlib_raw/translator/' +
-                   '{target_dir}/lib').format(root=nacl_root,
-                                              target_dir=target_lib_dir)
-        shellcmd((
-            '{gold} -nostdlib --no-fix-cortex-a8 --eh-frame-hdr -z text ' +
-            '--build-id --entry=__pnacl_start -static ' +
-            '{linklib}/crtbegin.o {partial} ' +
-            '{root}/toolchain_build/src/subzero/build/runtime/' +
-            'szrt_sb_{target}.o ' +
-            '{linklib}/libpnacl_irt_shim_dummy.a --start-group ' +
-            '{linklib}/libgcc.a {linklib}/libcrt_platform.a ' +
-            '--end-group {linklib}/crtend.o --undefined=_start ' +
-            '--defsym=__Sz_AbsoluteZero=0 ' +
-            '-o {exe}'
-            ).format(gold=gold, linklib=linklib, partial=obj_partial, exe=exe,
-                     root=nacl_root, target=args.target),
-                 echo=args.verbose)
+        LinkSandbox([obj_partial], exe, args.target, args.verbose)
+    elif args.nonsfi:
+        LinkNonsfi([obj_partial], exe, args.target, args.verbose)
     else:
-        linker = {
-          'arm32': '/usr/bin/arm-linux-gnueabihf-g++',
-          'x8632': ('{root}/../third_party/llvm-build/Release+Asserts/bin/clang'
-                   ).format(root=nacl_root),
-          'x8664': ('{root}/../third_party/llvm-build/Release+Asserts/bin/clang'
-                   ).format(root=nacl_root)
-        }[args.target]
-
-        extra_linker_args = ' '.join({
-          'arm32': ['-mcpu=cortex-a9'],
-          'x8632': ['-m32'],
-          'x8664': ['-mx32']
-        }[args.target])
-
-        lib_dir = {
-          'arm32': 'arm-linux',
-          'x8632': 'x86-32-linux',
-          'x8664': 'x86-64-linux',
-        }[args.target]
-
-        shellcmd((
-            '{ld} {ld_extra_args} {partial} -o {exe} ' +
-            # Keep the rest of this command line (except szrt_native_x8632.o) in
-            # sync with RunHostLD() in pnacl-translate.py.
-            '{root}/toolchain/linux_x86/pnacl_newlib_raw/translator/' +
-            '{lib_dir}/lib/' +
-            '{{unsandboxed_irt,irt_random,irt_query_list}}.o ' +
-            '{root}/toolchain_build/src/subzero/build/runtime/' +
-            'szrt_native_{target}.o -lpthread -lrt ' +
-            '-Wl,--defsym=__Sz_AbsoluteZero=0'
-            ).format(ld=linker, ld_extra_args=extra_linker_args,
-                     partial=obj_partial, exe=exe, root=nacl_root,
-                     target=args.target, lib_dir=lib_dir),
-                 echo=args.verbose)
+        LinkNative([obj_partial], exe, args.target, args.verbose)
 
     # Put the extra verbose printing at the end.
     if args.verbose and hybrid:
diff --git a/pydir/szbuild_spec2k.py b/pydir/szbuild_spec2k.py
index 0c2f38a..b9cf15d 100755
--- a/pydir/szbuild_spec2k.py
+++ b/pydir/szbuild_spec2k.py
@@ -16,6 +16,8 @@
     './run_all.sh RunBenchmarks SetupGccX8632Opt {train|ref} ...'
     -- or --
     './run_all.sh RunBenchmarks SetupPnaclX8632Opt {train|ref} ...'
+    -- or --
+    './run_all.sh RunBenchmarks SetupNonsfiX8632Opt {train|ref} ...'
     """
     nacl_root = FindBaseNaCl()
     # Use the same default ordering as spec2k/run_all.sh.
@@ -45,7 +47,9 @@
     run_all_target = target_map[args.target] # fail if target not listed above
 
     suffix = (
-        'pnacl.opt.{target}' if args.sandbox else 'gcc.opt.{target}').format(
+        'pnacl.opt.{target}' if args.sandbox else
+        'nonsfi.opt.{target}' if args.nonsfi else
+        'gcc.opt.{target}').format(
              target=run_all_target);
     for comp in args.comps:
         name = os.path.splitext(comp)[1] or comp
@@ -61,7 +65,9 @@
                                       suffix=suffix))
     if args.run:
         os.chdir('{root}/tests/spec2k'.format(root=FindBaseNaCl()))
-        setup = 'SetupGcc' + {
+        setup = 'Setup' + ('Pnacl' if args.sandbox else
+                           'Nonsfi' if args.nonsfi else
+                           'Gcc') + {
             'arm32': 'Arm',
             'x8632': 'X8632',
             'x8664': 'X8664'}[args.target] + 'Opt'
diff --git a/pydir/utils.py b/pydir/utils.py
index 1141e3e..6fdedfa 100644
--- a/pydir/utils.py
+++ b/pydir/utils.py
@@ -20,3 +20,12 @@
         return None
     last_index = len(path_list) - path_list[::-1].index(nacl)
     return os.sep.join(path_list[:last_index])
+
+def get_sfi_string(args, sb_ret, nonsfi_ret, native_ret):
+    """Return a value depending on args.sandbox and args.nonsfi."""
+    if args.sandbox:
+        assert(not args.nonsfi)
+        return sb_ret
+    if args.nonsfi:
+        return nonsfi_ret
+    return native_ret
diff --git a/runtime/szrt_asm_arm32.s b/runtime/szrt_asm_arm32.s
new file mode 100644
index 0000000..1d01909
--- /dev/null
+++ b/runtime/szrt_asm_arm32.s
@@ -0,0 +1,16 @@
+##===- subzero/runtime/szrt_asm_arm32.s - Subzero runtime asm helpers------===##
+##
+##                        The Subzero Code Generator
+##
+## This file is distributed under the University of Illinois Open Source
+## License. See LICENSE.TXT for details.
+##
+##===----------------------------------------------------------------------===##
+##
+## This file provides an assembly implementation of various helpers needed by
+## the Subzero arm32 runtime.
+##
+##===----------------------------------------------------------------------===##
+
+	.text
+	.p2alignl 4,0xE7FEDEF0
diff --git a/runtime/szrt_asm_x8632.s b/runtime/szrt_asm_x8632.s
new file mode 100644
index 0000000..518acef
--- /dev/null
+++ b/runtime/szrt_asm_x8632.s
@@ -0,0 +1,51 @@
+##===- subzero/runtime/szrt_asm_x8632.s - Subzero runtime asm helpers------===##
+##
+##                        The Subzero Code Generator
+##
+## This file is distributed under the University of Illinois Open Source
+## License. See LICENSE.TXT for details.
+##
+##===----------------------------------------------------------------------===##
+##
+## This file provides an assembly implementation of various helpers needed by
+## the Subzero x8632 runtime.
+##
+##===----------------------------------------------------------------------===##
+
+	.text
+	.p2align 5,0xf4
+
+	.globl __Sz_getIP_eax
+__Sz_getIP_eax:
+	movl (%esp), %eax
+	ret
+
+	.globl __Sz_getIP_ecx
+__Sz_getIP_ecx:
+	movl (%esp), %ecx
+	ret
+
+	.globl __Sz_getIP_edx
+__Sz_getIP_edx:
+	movl (%esp), %edx
+	ret
+
+	.globl __Sz_getIP_ebx
+__Sz_getIP_ebx:
+	movl (%esp), %ebx
+	ret
+
+	.globl __Sz_getIP_ebp
+__Sz_getIP_ebp:
+	movl (%esp), %ebp
+	ret
+
+	.globl __Sz_getIP_esi
+__Sz_getIP_esi:
+	movl (%esp), %esi
+	ret
+
+	.globl __Sz_getIP_edi
+__Sz_getIP_edi:
+	movl (%esp), %edi
+	ret
diff --git a/runtime/szrt_asm_x8664.s b/runtime/szrt_asm_x8664.s
new file mode 100644
index 0000000..5a33685
--- /dev/null
+++ b/runtime/szrt_asm_x8664.s
@@ -0,0 +1,16 @@
+##===- subzero/runtime/szrt_asm_x8664.s - Subzero runtime asm helpers------===##
+##
+##                        The Subzero Code Generator
+##
+## This file is distributed under the University of Illinois Open Source
+## License. See LICENSE.TXT for details.
+##
+##===----------------------------------------------------------------------===##
+##
+## This file provides an assembly implementation of various helpers needed by
+## the Subzero x8664 runtime.
+##
+##===----------------------------------------------------------------------===##
+
+	.text
+	.p2align 5,0xf4
diff --git a/src/IceAssemblerARM32.cpp b/src/IceAssemblerARM32.cpp
index cb3816f..8448612 100644
--- a/src/IceAssemblerARM32.cpp
+++ b/src/IceAssemblerARM32.cpp
@@ -499,7 +499,7 @@
          "mov" << (kind() == llvm::ELF::R_ARM_MOVW_ABS_NC ? "w" : "t") << "\t"
       << RegARM32::getRegName((Inst >> kRdShift) & 0xF)
       << ", #:" << (kind() == llvm::ELF::R_ARM_MOVW_ABS_NC ? "lower" : "upper")
-      << "16:" << symbol(Ctx) << "\t@ .word "
+      << "16:" << symbol(Ctx, &Asm) << "\t@ .word "
       << llvm::format_hex_no_prefix(Inst, 8) << "\n";
   return InstARM32::InstSize;
 }
@@ -522,7 +522,7 @@
   Ostream &Str = Ctx->getStrEmit();
   IValueT Inst = Asm.load<IValueT>(position());
   Str << "\t"
-         "bl\t" << symbol(Ctx) << "\t@ .word "
+         "bl\t" << symbol(Ctx, &Asm) << "\t@ .word "
       << llvm::format_hex_no_prefix(Inst, 8) << "\n";
   return InstARM32::InstSize;
 }
diff --git a/src/IceAssemblerX86Base.h b/src/IceAssemblerX86Base.h
index 5a72b6c..9da66a3 100644
--- a/src/IceAssemblerX86Base.h
+++ b/src/IceAssemblerX86Base.h
@@ -176,7 +176,7 @@
   bool fixupIsPCRel(FixupKind Kind) const override {
     // Currently assuming this is the only PC-rel relocation type used.
     // TODO(jpp): Traits.PcRelTypes.count(Kind) != 0
-    return Kind == Traits::PcRelFixup;
+    return Kind == Traits::FK_PcRel;
   }
 
   // Operations to emit GPR instructions (and dispatch on operand type).
diff --git a/src/IceAssemblerX86BaseImpl.h b/src/IceAssemblerX86BaseImpl.h
index 040fed8..912130a 100644
--- a/src/IceAssemblerX86BaseImpl.h
+++ b/src/IceAssemblerX86BaseImpl.h
@@ -127,7 +127,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   intptr_t call_start = Buffer.getPosition();
   emitUint8(0xE8);
-  emitFixup(this->createFixup(Traits::PcRelFixup, label));
+  emitFixup(this->createFixup(Traits::FK_PcRel, label));
   emitInt32(-4);
   assert((Buffer.getPosition() - call_start) == kCallExternalLabelSize);
   (void)call_start;
@@ -138,7 +138,7 @@
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   intptr_t call_start = Buffer.getPosition();
   emitUint8(0xE8);
-  emitFixup(this->createFixup(Traits::PcRelFixup, AssemblerFixup::NullSymbol));
+  emitFixup(this->createFixup(Traits::FK_PcRel, AssemblerFixup::NullSymbol));
   emitInt32(abs_address.value() - 4);
   assert((Buffer.getPosition() - call_start) == kCallExternalLabelSize);
   (void)call_start;
@@ -3098,10 +3098,11 @@
 template <typename TraitsType>
 void AssemblerX86Base<TraitsType>::j(BrCond condition,
                                      const ConstantRelocatable *label) {
+  llvm::report_fatal_error("Untested - please verify and then reenable.");
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0x0F);
   emitUint8(0x80 + condition);
-  emitFixup(this->createFixup(Traits::PcRelFixup, label));
+  emitFixup(this->createFixup(Traits::FK_PcRel, label));
   emitInt32(-4);
 }
 
@@ -3139,9 +3140,10 @@
 
 template <typename TraitsType>
 void AssemblerX86Base<TraitsType>::jmp(const ConstantRelocatable *label) {
+  llvm::report_fatal_error("Untested - please verify and then reenable.");
   AssemblerBuffer::EnsureCapacity ensured(&Buffer);
   emitUint8(0xE9);
-  emitFixup(this->createFixup(Traits::PcRelFixup, label));
+  emitFixup(this->createFixup(Traits::FK_PcRel, label));
   emitInt32(-4);
 }
 
diff --git a/src/IceClFlags.cpp b/src/IceClFlags.cpp
index bd7f0fc..ed5a71d 100644
--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -256,6 +256,9 @@
     TranslateOnly("translate-only",
                   cl::desc("Translate only the given function"), cl::init(""));
 
+/// Enable Non-SFI mode.
+cl::opt<bool> UseNonsfi("nonsfi", cl::desc("Enable Non-SFI mode"));
+
 /// Use sandboxing.
 cl::opt<bool> UseSandboxing("sandbox", cl::desc("Use sandboxing"));
 
@@ -455,6 +458,7 @@
   OutFlags.SkipUnimplemented = false;
   OutFlags.SubzeroTimingEnabled = false;
   OutFlags.TimeEachFunction = false;
+  OutFlags.UseNonsfi = false;
   OutFlags.UseSandboxing = false;
   // Enum and integer fields.
   OutFlags.Opt = Opt_m1;
@@ -531,6 +535,7 @@
   OutFlags.setTimeEachFunction(::TimeEachFunction);
   OutFlags.setTimingFocusOn(::TimingFocusOn);
   OutFlags.setTranslateOnly(::TranslateOnly);
+  OutFlags.setUseNonsfi(::UseNonsfi);
   OutFlags.setUseSandboxing(::UseSandboxing);
   OutFlags.setVerboseFocusOn(::VerboseFocusOn);
   OutFlags.setOutFileType(::OutFileType);
diff --git a/src/IceClFlags.h b/src/IceClFlags.h
index 533f23e..d89186c 100644
--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -181,6 +181,11 @@
   /// Set ClFlags::TimeEachFunction to a new value
   void setTimeEachFunction(bool NewValue) { TimeEachFunction = NewValue; }
 
+  /// Get the value of ClFlags::UseNonsfi
+  bool getUseNonsfi() const { return UseNonsfi; }
+  /// Set ClFlags::UseNonsfi to a new value
+  void setUseNonsfi(bool NewValue) { UseNonsfi = NewValue; }
+
   /// Get the value of ClFlags::UseSandboxing
   bool getUseSandboxing() const { return UseSandboxing; }
   /// Set ClFlags::UseSandboxing to a new value
@@ -415,6 +420,8 @@
   bool SubzeroTimingEnabled;
   /// see anonymous_namespace{IceClFlags.cpp}::TimeEachFunction
   bool TimeEachFunction;
+  /// see anonymous_namespace{IceClFlags.cpp}::UseNonsfi
+  bool UseNonsfi;
   /// see anonymous_namespace{IceClFlags.cpp}::UseSandboxing
   bool UseSandboxing;
   /// see anonymous_namespace{IceClFlags.cpp}::OLevel
diff --git a/src/IceELFObjectWriter.cpp b/src/IceELFObjectWriter.cpp
index 5d81635..cb36e3d 100644
--- a/src/IceELFObjectWriter.cpp
+++ b/src/IceELFObjectWriter.cpp
@@ -285,7 +285,8 @@
 
 void ELFObjectWriter::writeDataSection(const VariableDeclarationList &Vars,
                                        FixupKind RelocationKind,
-                                       const IceString &SectionSuffix) {
+                                       const IceString &SectionSuffix,
+                                       bool IsPIC) {
   assert(!SectionNumbersAssigned);
   VariableDeclarationList VarsBySection[ELFObjectWriter::NumSectionTypes];
   for (auto &SectionList : VarsBySection)
@@ -295,7 +296,7 @@
   size_t I = 0;
   for (auto &SectionList : VarsBySection) {
     writeDataOfType(static_cast<SectionType>(I++), SectionList, RelocationKind,
-                    SectionSuffix);
+                    SectionSuffix, IsPIC);
   }
 }
 
@@ -311,7 +312,8 @@
 void ELFObjectWriter::writeDataOfType(SectionType ST,
                                       const VariableDeclarationList &Vars,
                                       FixupKind RelocationKind,
-                                      const IceString &SectionSuffix) {
+                                      const IceString &SectionSuffix,
+                                      bool IsPIC) {
   if (Vars.empty())
     return;
   ELFDataSection *Section;
@@ -326,8 +328,9 @@
   // Lift this out, so it can be re-used if we do fdata-sections?
   switch (ST) {
   case ROData: {
-    const IceString SectionName = MangleSectionName(".rodata", SectionSuffix);
-    constexpr Elf64_Xword ShFlags = SHF_ALLOC;
+    const IceString SectionName =
+        MangleSectionName(IsPIC ? ".data.rel.ro" : ".rodata", SectionSuffix);
+    const Elf64_Xword ShFlags = SHF_ALLOC | (IsPIC ? SHF_WRITE : 0);
     Section = createSection<ELFDataSection>(SectionName, SHT_PROGBITS, ShFlags,
                                             ShAddralign, ShEntsize);
     Section->setFileOffset(alignFileOffset(ShAddralign));
@@ -557,14 +560,14 @@
 }
 
 void ELFObjectWriter::writeJumpTable(const JumpTableData &JT,
-                                     FixupKind RelocationKind) {
+                                     FixupKind RelocationKind, bool IsPIC) {
   ELFDataSection *Section;
   ELFRelocationSection *RelSection;
   const Elf64_Xword PointerSize = typeWidthInBytes(getPointerType());
   const Elf64_Xword ShAddralign = PointerSize;
   const Elf64_Xword ShEntsize = PointerSize;
-  const IceString SectionName =
-      MangleSectionName(".rodata", JT.getFunctionName() + "$jumptable");
+  const IceString SectionName = MangleSectionName(
+      IsPIC ? ".data.rel.ro" : ".rodata", JT.getFunctionName() + "$jumptable");
   Section = createSection<ELFDataSection>(SectionName, SHT_PROGBITS, SHF_ALLOC,
                                           ShAddralign, ShEntsize);
   Section->setFileOffset(alignFileOffset(ShAddralign));
diff --git a/src/IceELFObjectWriter.h b/src/IceELFObjectWriter.h
index f2171b1..58be83c 100644
--- a/src/IceELFObjectWriter.h
+++ b/src/IceELFObjectWriter.h
@@ -62,7 +62,7 @@
   /// RelocationKind for any relocations.
   void writeDataSection(const VariableDeclarationList &Vars,
                         FixupKind RelocationKind,
-                        const IceString &SectionSuffix);
+                        const IceString &SectionSuffix, bool IsPIC);
 
   /// Copy data of a function's text section to file and note the offset of the
   /// symbol's definition in the symbol table. Copy the text fixups for use
@@ -77,7 +77,8 @@
   template <typename ConstType> void writeConstantPool(Type Ty);
 
   /// Write a jump table and register fixups for the target addresses.
-  void writeJumpTable(const JumpTableData &JT, FixupKind RelocationKind);
+  void writeJumpTable(const JumpTableData &JT, FixupKind RelocationKind,
+                      bool IsPIC);
 
   /// Populate the symbol table with a list of external/undefined symbols.
   void setUndefinedSyms(const ConstantList &UndefSyms);
@@ -153,8 +154,8 @@
   /// SectionType.
   void writeDataOfType(SectionType SectionType,
                        const VariableDeclarationList &Vars,
-                       FixupKind RelocationKind,
-                       const IceString &SectionSuffix);
+                       FixupKind RelocationKind, const IceString &SectionSuffix,
+                       bool IsPIC);
 
   /// Write the final relocation sections given the final symbol table. May also
   /// be able to seek around the file and resolve function calls that are for
diff --git a/src/IceELFSection.h b/src/IceELFSection.h
index 6d24e4a..ef1b299 100644
--- a/src/IceELFSection.h
+++ b/src/IceELFSection.h
@@ -352,10 +352,12 @@
                                      const ELFSymbolTableSection *SymTab) {
   for (const AssemblerFixup &Fixup : Fixups) {
     const ELFSym *Symbol;
-    if (Fixup.isNullSymbol())
+    if (Fixup.isNullSymbol()) {
       Symbol = SymTab->getNullSymbol();
-    else
-      Symbol = SymTab->findSymbol(Fixup.symbol(&Ctx));
+    } else {
+      constexpr Assembler *Asm = nullptr;
+      Symbol = SymTab->findSymbol(Fixup.symbol(&Ctx, Asm));
+    }
     if (!Symbol)
       llvm::report_fatal_error("Missing symbol mentioned in reloc");
 
diff --git a/src/IceFixups.cpp b/src/IceFixups.cpp
index 59bd4f3..ef594ac 100644
--- a/src/IceFixups.cpp
+++ b/src/IceFixups.cpp
@@ -29,7 +29,8 @@
   return 0;
 }
 
-IceString AssemblerFixup::symbol(const GlobalContext *Ctx) const {
+IceString AssemblerFixup::symbol(const GlobalContext *Ctx,
+                                 const Assembler *Asm) const {
   std::string Buffer;
   llvm::raw_string_ostream Str(Buffer);
   const Constant *C = value_;
@@ -39,6 +40,9 @@
       Str << CR->getName();
     else
       Str << Ctx->mangleName(CR->getName());
+    if (Asm && !Asm->fixupIsPCRel(kind()) && Ctx->getFlags().getUseNonsfi()) {
+      Str << "@GOTOFF";
+    }
   } else {
     // NOTE: currently only float/doubles are put into constant pools. In the
     // future we may put integers as well.
@@ -57,7 +61,7 @@
   if (isNullSymbol())
     Str << "__Sz_AbsoluteZero";
   else
-    Str << symbol(Ctx);
+    Str << symbol(Ctx, &Asm);
   RelocOffsetT Offset = Asm.load<RelocOffsetT>(position());
   if (Offset)
     Str << " + " << Offset;
diff --git a/src/IceFixups.h b/src/IceFixups.h
index a83a00e..aeb0fb0 100644
--- a/src/IceFixups.h
+++ b/src/IceFixups.h
@@ -44,7 +44,7 @@
   void set_kind(FixupKind Kind) { kind_ = Kind; }
 
   RelocOffsetT offset() const;
-  IceString symbol(const GlobalContext *Ctx) const;
+  IceString symbol(const GlobalContext *Ctx, const Assembler *Asm) const;
 
   static const Constant *NullSymbol;
   bool isNullSymbol() const { return value_ == NullSymbol; }
diff --git a/src/IceGlobalContext.cpp b/src/IceGlobalContext.cpp
index 58406f3..62a70e4 100644
--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -271,7 +271,7 @@
   ProfileBlockInfoVarDecl->setSuppressMangling();
   ProfileBlockInfoVarDecl->setLinkage(llvm::GlobalValue::ExternalLinkage);
 
-  TargetLowering::staticInit(Flags.getTargetArch());
+  TargetLowering::staticInit(Flags);
 }
 
 void GlobalContext::translateFunctions() {
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index e4f697a..f336ba9 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -493,10 +493,11 @@
     addSource(Src);
 }
 
-InstFakeUse::InstFakeUse(Cfg *Func, Variable *Src)
-    : InstHighLevel(Func, Inst::FakeUse, 1, nullptr) {
+InstFakeUse::InstFakeUse(Cfg *Func, Variable *Src, uint32_t Weight)
+    : InstHighLevel(Func, Inst::FakeUse, Weight, nullptr) {
   assert(Src);
-  addSource(Src);
+  for (uint32_t i = 0; i < Weight; ++i)
+    addSource(Src);
 }
 
 InstFakeKill::InstFakeKill(Cfg *Func, const Inst *Linked)
diff --git a/src/IceInst.h b/src/IceInst.h
index f9265f6..f8ca48c 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -854,15 +854,18 @@
 /// FakeUse instruction. This creates a fake use of a variable, to keep the
 /// instruction that produces that variable from being dead-code eliminated.
 /// This is useful in a variety of lowering situations. The FakeUse instruction
-/// has no dest, so it can itself never be dead-code eliminated.
+/// has no dest, so it can itself never be dead-code eliminated.  A weight can
+/// be supplied to provide extra bias to the register allocator - for simplicity
+/// of implementation, weight=N is handled by holding N copies of the variable
+/// as source operands.
 class InstFakeUse : public InstHighLevel {
   InstFakeUse() = delete;
   InstFakeUse(const InstFakeUse &) = delete;
   InstFakeUse &operator=(const InstFakeUse &) = delete;
 
 public:
-  static InstFakeUse *create(Cfg *Func, Variable *Src) {
-    return new (Func->allocate<InstFakeUse>()) InstFakeUse(Func, Src);
+  static InstFakeUse *create(Cfg *Func, Variable *Src, uint32_t Weight = 1) {
+    return new (Func->allocate<InstFakeUse>()) InstFakeUse(Func, Src, Weight);
   }
   void emit(const Cfg *Func) const override;
   void emitIAS(const Cfg * /* Func */) const override {}
@@ -870,7 +873,7 @@
   static bool classof(const Inst *Inst) { return Inst->getKind() == FakeUse; }
 
 private:
-  InstFakeUse(Cfg *Func, Variable *Src);
+  InstFakeUse(Cfg *Func, Variable *Src, uint32_t Weight);
 };
 
 /// FakeKill instruction. This "kills" a set of variables by modeling a trivial
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index bf26f84..a4218b7 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -74,13 +74,11 @@
     Str << "<OperandX8632>";
 }
 
-TargetX8632Traits::X86OperandMem::X86OperandMem(Cfg *Func, Type Ty,
-                                                Variable *Base,
-                                                Constant *Offset,
-                                                Variable *Index, uint16_t Shift,
-                                                SegmentRegisters SegmentReg)
+TargetX8632Traits::X86OperandMem::X86OperandMem(
+    Cfg *Func, Type Ty, Variable *Base, Constant *Offset, Variable *Index,
+    uint16_t Shift, SegmentRegisters SegmentReg, bool IsPIC)
     : X86Operand(kMem, Ty), Base(Base), Offset(Offset), Index(Index),
-      Shift(Shift), SegmentReg(SegmentReg), Randomized(false) {
+      Shift(Shift), SegmentReg(SegmentReg), IsPIC(IsPIC) {
   assert(Shift <= 3);
   Vars = nullptr;
   NumVars = 0;
@@ -100,9 +98,9 @@
 }
 
 namespace {
-static int32_t
-GetRematerializableOffset(Variable *Var,
-                          const Ice::X8632::TargetX8632 *Target) {
+
+int32_t GetRematerializableOffset(Variable *Var,
+                                  const Ice::X8632::TargetX8632 *Target) {
   int32_t Disp = Var->getStackOffset();
   SizeT RegNum = static_cast<SizeT>(Var->getRegNum());
   if (RegNum == Target->getFrameReg()) {
@@ -112,11 +110,29 @@
   }
   return Disp;
 }
+
+void validateMemOperandPIC(const TargetX8632Traits::X86OperandMem *Mem,
+                           bool UseNonsfi) {
+  if (!BuildDefs::asserts())
+    return;
+  const bool HasCR =
+      Mem->getOffset() && llvm::isa<ConstantRelocatable>(Mem->getOffset());
+  (void)HasCR;
+  const bool IsPIC = Mem->getIsPIC();
+  (void)IsPIC;
+  if (UseNonsfi)
+    assert(HasCR == IsPIC);
+  else
+    assert(!IsPIC);
+}
+
 } // end of anonymous namespace
 
 void TargetX8632Traits::X86OperandMem::emit(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
+  const bool UseNonsfi = Func->getContext()->getFlags().getUseNonsfi();
+  validateMemOperandPIC(this, UseNonsfi);
   const auto *Target =
       static_cast<const ::Ice::X8632::TargetX8632 *>(Func->getTarget());
   // If the base is rematerializable, we need to replace it with the correct
@@ -137,9 +153,9 @@
   }
   // Emit as Offset(Base,Index,1<<Shift). Offset is emitted without the leading
   // '$'. Omit the (Base,Index,1<<Shift) part if Base==nullptr.
-  if (getOffset() == 0 && Disp == 0) {
+  if (getOffset() == nullptr && Disp == 0) {
     // No offset, emit nothing.
-  } else if (getOffset() == 0 && Disp != 0) {
+  } else if (getOffset() == nullptr && Disp != 0) {
     Str << Disp;
   } else if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(getOffset())) {
     if (getBase() == nullptr || CI->getValue() || Disp != 0)
@@ -150,7 +166,7 @@
     // TODO(sehr): ConstantRelocatable still needs updating for
     // rematerializable base/index and Disp.
     assert(Disp == 0);
-    CR->emitWithoutPrefix(Target);
+    CR->emitWithoutPrefix(Target, UseNonsfi ? "@GOTOFF" : "");
   } else {
     llvm_unreachable("Invalid offset type for x86 mem operand");
   }
@@ -245,9 +261,11 @@
 TargetX8632Traits::Address TargetX8632Traits::X86OperandMem::toAsmAddress(
     TargetX8632Traits::Assembler *Asm,
     const Ice::TargetLowering *TargetLowering) const {
-  int32_t Disp = 0;
   const auto *Target =
       static_cast<const ::Ice::X8632::TargetX8632 *>(TargetLowering);
+  const bool UseNonsfi = Target->getGlobalContext()->getFlags().getUseNonsfi();
+  validateMemOperandPIC(this, UseNonsfi);
+  int32_t Disp = 0;
   if (getBase() && getBase()->isRematerializable()) {
     Disp += GetRematerializableOffset(getBase(), Target);
   }
@@ -264,7 +282,7 @@
     } else if (const auto CR =
                    llvm::dyn_cast<ConstantRelocatable>(getOffset())) {
       Disp += CR->getOffset();
-      Fixup = Asm->createFixup(RelFixup, CR);
+      Fixup = Asm->createFixup(Target->getAbsFixup(), CR);
     } else {
       llvm_unreachable("Unexpected offset type");
     }
diff --git a/src/IceInstX8664.cpp b/src/IceInstX8664.cpp
index e7130ed..593b39f 100644
--- a/src/IceInstX8664.cpp
+++ b/src/IceInstX8664.cpp
@@ -66,9 +66,10 @@
 TargetX8664Traits::X86OperandMem::X86OperandMem(Cfg *Func, Type Ty,
                                                 Variable *Base,
                                                 Constant *Offset,
-                                                Variable *Index, uint16_t Shift)
+                                                Variable *Index, uint16_t Shift,
+                                                bool IsPIC)
     : X86Operand(kMem, Ty), Base(Base), Offset(Offset), Index(Index),
-      Shift(Shift) {
+      Shift(Shift), IsPIC(IsPIC) {
   assert(Shift <= 3);
   Vars = nullptr;
   NumVars = 0;
@@ -133,7 +134,8 @@
     // TODO(sehr): ConstantRelocatable still needs updating for
     // rematerializable base/index and Disp.
     assert(Disp == 0);
-    CR->emitWithoutPrefix(Func->getTarget());
+    const bool UseNonsfi = Func->getContext()->getFlags().getUseNonsfi();
+    CR->emitWithoutPrefix(Func->getTarget(), UseNonsfi ? "@GOTOFF" : "");
   } else {
     llvm_unreachable("Invalid offset type for x86 mem operand");
   }
@@ -242,8 +244,8 @@
       Disp += static_cast<int32_t>(CI->getValue());
     } else if (const auto CR =
                    llvm::dyn_cast<ConstantRelocatable>(getOffset())) {
-      Disp += CR->getOffset();
-      Fixup = Asm->createFixup(RelFixup, CR);
+      Disp = CR->getOffset();
+      Fixup = Asm->createFixup(FK_Abs, CR);
     } else {
       llvm_unreachable("Unexpected offset type");
     }
diff --git a/src/IceInstX86Base.h b/src/IceInstX86Base.h
index 8cefabb..c4a9fe4 100644
--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -101,6 +101,7 @@
       FakeRMW,
       Fld,
       Fstp,
+      GetIP,
       Icmp,
       Idiv,
       Imul,
@@ -265,6 +266,26 @@
                    InstArithmetic::OpKind Op, Variable *Beacon);
   };
 
+  class InstX86GetIP final : public InstX86Base {
+    InstX86GetIP() = delete;
+    InstX86GetIP(const InstX86GetIP &) = delete;
+    InstX86GetIP &operator=(const InstX86GetIP &) = delete;
+
+  public:
+    static InstX86GetIP *create(Cfg *Func, Variable *Dest) {
+      return new (Func->allocate<InstX86GetIP>()) InstX86GetIP(Func, Dest);
+    }
+    void emit(const Cfg *Func) const override;
+    void emitIAS(const Cfg *Func) const override;
+    void dump(const Cfg *Func) const override;
+    static bool classof(const Inst *Inst) {
+      return InstX86Base::isClassof(Inst, InstX86Base::GetIP);
+    }
+
+  private:
+    InstX86GetIP(Cfg *Func, Variable *Dest);
+  };
+
   /// InstX86Label represents an intra-block label that is the target of an
   /// intra-block branch. The offset between the label and the branch must be
   /// fit into one byte (considered "near"). These are used for lowering i1
@@ -2771,6 +2792,7 @@
 ///
 /// using Insts = ::Ice::X86NAMESPACE::Insts<TraitsType>;
 template <typename TraitsType> struct Insts {
+  using GetIP = typename InstImpl<TraitsType>::InstX86GetIP;
   using FakeRMW = typename InstImpl<TraitsType>::InstX86FakeRMW;
   using Label = typename InstImpl<TraitsType>::InstX86Label;
 
diff --git a/src/IceInstX86BaseImpl.h b/src/IceInstX86BaseImpl.h
index 96ed422..addb599 100644
--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -24,6 +24,7 @@
 #include "IceInst.h"
 #include "IceOperand.h"
 #include "IceTargetLowering.h"
+#include "IceTargetLoweringX86Base.h"
 
 namespace Ice {
 
@@ -57,6 +58,10 @@
 }
 
 template <typename TraitsType>
+InstImpl<TraitsType>::InstX86GetIP::InstX86GetIP(Cfg *Func, Variable *Dest)
+    : InstX86Base(Func, InstX86Base::GetIP, 0, Dest) {}
+
+template <typename TraitsType>
 InstImpl<TraitsType>::InstX86Mul::InstX86Mul(Cfg *Func, Variable *Dest,
                                              Variable *Source1,
                                              Operand *Source2)
@@ -391,6 +396,55 @@
 }
 
 template <typename TraitsType>
+void InstImpl<TraitsType>::InstX86GetIP::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(this->getDest()->hasReg());
+  Str << "\t"
+         "addl\t$_GLOBAL_OFFSET_TABLE_, ";
+  this->getDest()->emit(Func);
+}
+
+template <typename TraitsType>
+void InstImpl<TraitsType>::InstX86GetIP::emitIAS(const Cfg *Func) const {
+  if (Func->getContext()->getFlags().getOutFileType() == FT_Iasm) {
+    // TODO(stichnot): Find a workaround for llvm-mc's inability to handle
+    // something like ".long _GLOBAL_OFFSET_TABLE_ + .".  One possibility is to
+    // just use hybrid iasm output for this add instruction.
+    llvm::report_fatal_error(
+        "Iasm support for _GLOBAL_OFFSET_TABLE_ not implemented");
+  }
+  Assembler *Asm = Func->getAssembler<Assembler>();
+  assert(this->getDest()->hasReg());
+  GPRRegister Reg = Traits::getEncodedGPR(this->getDest()->getRegNum());
+  Constant *GlobalOffsetTable =
+      Func->getContext()->getConstantExternSym("_GLOBAL_OFFSET_TABLE_");
+  AssemblerFixup *Fixup = Asm->createFixup(Traits::FK_GotPC, GlobalOffsetTable);
+  intptr_t OrigPos = Asm->getBufferSize();
+  constexpr int32_t TempDisp = 0;
+  constexpr int32_t ImmediateWidth = 4;
+  // Emit the add instruction once, in a preliminary fashion, to find its total
+  // size.  TODO(stichnot): IceType_i32 should really be something that
+  // represents the target's pointer type.
+  Asm->add(IceType_i32, Reg, AssemblerImmediate(TempDisp, Fixup));
+  const int32_t Disp = Asm->getBufferSize() - OrigPos - ImmediateWidth;
+  // Now roll back and emit the add instruction again, this time with the
+  // correct displacement.
+  Asm->setBufferSize(OrigPos);
+  Asm->add(IceType_i32, Reg, AssemblerImmediate(Disp, Fixup));
+}
+
+template <typename TraitsType>
+void InstImpl<TraitsType>::InstX86GetIP::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  this->getDest()->dump(Func);
+  Str << " = call getIP";
+}
+
+template <typename TraitsType>
 void InstImpl<TraitsType>::InstX86Label::emit(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
@@ -679,7 +733,8 @@
   } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
     (Asm->*(Emitter.GPRImm))(Ty, VarReg, AssemblerImmediate(Imm->getValue()));
   } else if (const auto *Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-    AssemblerFixup *Fixup = Asm->createFixup(Traits::RelFixup, Reloc);
+    AssemblerFixup *Fixup =
+        Asm->createFixup(Traits::TargetLowering::getAbsFixup(), Reloc);
     (Asm->*(Emitter.GPRImm))(Ty, VarReg,
                              AssemblerImmediate(Reloc->getOffset(), Fixup));
   } else if (const auto *Split = llvm::dyn_cast<VariableSplit>(Src)) {
@@ -703,7 +758,8 @@
   } else if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src)) {
     (Asm->*(Emitter.AddrImm))(Ty, Addr, AssemblerImmediate(Imm->getValue()));
   } else if (const auto *Reloc = llvm::dyn_cast<ConstantRelocatable>(Src)) {
-    AssemblerFixup *Fixup = Asm->createFixup(Traits::RelFixup, Reloc);
+    AssemblerFixup *Fixup =
+        Asm->createFixup(Traits::TargetLowering::getAbsFixup(), Reloc);
     (Asm->*(Emitter.AddrImm))(Ty, Addr,
                               AssemblerImmediate(Reloc->getOffset(), Fixup));
   } else {
diff --git a/src/IceOperand.cpp b/src/IceOperand.cpp
index 21a4830..f45d227 100644
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp
@@ -487,9 +487,9 @@
   Target->emit(this);
 }
 
-void ConstantRelocatable::emitWithoutPrefix(
-    const TargetLowering *Target) const {
-  Target->emitWithoutPrefix(this);
+void ConstantRelocatable::emitWithoutPrefix(const TargetLowering *Target,
+                                            const char *Suffix) const {
+  Target->emitWithoutPrefix(this, Suffix);
 }
 
 void ConstantRelocatable::dump(const Cfg *Func, Ostream &Str) const {
diff --git a/src/IceOperand.h b/src/IceOperand.h
index 67ff09f..137d456 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -287,7 +287,8 @@
   bool getSuppressMangling() const { return SuppressMangling; }
   using Constant::emit;
   void emit(TargetLowering *Target) const final;
-  void emitWithoutPrefix(const TargetLowering *Target) const;
+  void emitWithoutPrefix(const TargetLowering *Target,
+                         const char *Suffix = "") const;
   using Constant::dump;
   void dump(const Cfg *Func, Ostream &Str) const override;
 
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 69f7f12..04f77e6 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -39,7 +39,7 @@
 //       createTargetDataLowering(Ice::GlobalContext*);
 //   unique_ptr<Ice::TargetHeaderLowering>
 //       createTargetHeaderLowering(Ice::GlobalContext *);
-//   void staticInit();
+//   void staticInit(const ::Ice::ClFlags &Flags);
 // }
 #define SUBZERO_TARGET(X)                                                      \
   namespace X {                                                                \
@@ -49,7 +49,7 @@
   createTargetDataLowering(::Ice::GlobalContext *Ctx);                         \
   std::unique_ptr<::Ice::TargetHeaderLowering>                                 \
   createTargetHeaderLowering(::Ice::GlobalContext *Ctx);                       \
-  void staticInit();                                                           \
+  void staticInit(const ::Ice::ClFlags &Flags);                                \
   } // end of namespace X
 #include "llvm/Config/SZTargets.def"
 #undef SUBZERO_TARGET
@@ -129,7 +129,8 @@
   }
 }
 
-void TargetLowering::staticInit(TargetArch Target) {
+void TargetLowering::staticInit(const ClFlags &Flags) {
+  const TargetArch Target = Flags.getTargetArch();
   // Call the specified target's static initializer.
   switch (Target) {
   default:
@@ -141,8 +142,8 @@
       return;                                                                  \
     }                                                                          \
     InitGuard##X = true;                                                       \
-    ::X::staticInit();                                                         \
-  }
+    ::X::staticInit(Flags);                                                    \
+  } break;
 #include "llvm/Config/SZTargets.def"
 #undef SUBZERO_TARGET
   }
@@ -525,7 +526,8 @@
          Ctx->getFlags().getForceMemIntrinOpt();
 }
 
-void TargetLowering::emitWithoutPrefix(const ConstantRelocatable *C) const {
+void TargetLowering::emitWithoutPrefix(const ConstantRelocatable *C,
+                                       const char *Suffix) const {
   if (!BuildDefs::dump())
     return;
   Ostream &Str = Ctx->getStrEmit();
@@ -533,6 +535,7 @@
     Str << C->getName();
   else
     Str << Ctx->mangleName(C->getName());
+  Str << Suffix;
   RelocOffsetT Offset = C->getOffset();
   if (Offset) {
     if (Offset > 0)
@@ -541,14 +544,6 @@
   }
 }
 
-void TargetLowering::emit(const ConstantRelocatable *C) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Ctx->getStrEmit();
-  Str << getConstantPrefix();
-  emitWithoutPrefix(C);
-}
-
 std::unique_ptr<TargetDataLowering>
 TargetDataLowering::createLowering(GlobalContext *Ctx) {
   TargetArch Target = Ctx->getFlags().getTargetArch();
@@ -609,9 +604,12 @@
   Str << "\t.type\t" << MangledName << ",%object\n";
 
   const bool UseDataSections = Ctx->getFlags().getDataSections();
+  const bool UseNonsfi = Ctx->getFlags().getUseNonsfi();
   const IceString Suffix =
       dataSectionSuffix(SectionSuffix, MangledName, UseDataSections);
-  if (IsConstant)
+  if (IsConstant && UseNonsfi)
+    Str << "\t.section\t.data.rel.ro" << Suffix << ",\"aw\",%progbits\n";
+  else if (IsConstant)
     Str << "\t.section\t.rodata" << Suffix << ",\"a\",%progbits\n";
   else if (HasNonzeroInitializer)
     Str << "\t.section\t.data" << Suffix << ",\"aw\",%progbits\n";
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index e602f04..799872e 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -152,7 +152,9 @@
   TargetLowering &operator=(const TargetLowering &) = delete;
 
 public:
-  static void staticInit(TargetArch Target);
+  static void staticInit(const ClFlags &Flags);
+  // Each target must define a public static method:
+  //   static void staticInit(const ClFlags &Flags);
 
   static std::unique_ptr<TargetLowering> createLowering(TargetArch Target,
                                                         Cfg *Func);
@@ -242,6 +244,8 @@
   SizeT makeNextLabelNumber() { return NextLabelNumber++; }
   SizeT makeNextJumpTableNumber() { return NextJumpTableNumber++; }
   LoweringContext &getContext() { return Context; }
+  Cfg *getFunc() const { return Func; }
+  GlobalContext *getGlobalContext() const { return Ctx; }
 
   enum RegSet {
     RegSet_None = 0,
@@ -274,15 +278,15 @@
 
   virtual void emitVariable(const Variable *Var) const = 0;
 
-  void emitWithoutPrefix(const ConstantRelocatable *CR) const;
-  void emit(const ConstantRelocatable *CR) const;
-  virtual const char *getConstantPrefix() const = 0;
+  void emitWithoutPrefix(const ConstantRelocatable *CR,
+                         const char *Suffix = "") const;
 
-  virtual void emit(const ConstantUndef *C) const = 0;
   virtual void emit(const ConstantInteger32 *C) const = 0;
   virtual void emit(const ConstantInteger64 *C) const = 0;
   virtual void emit(const ConstantFloat *C) const = 0;
   virtual void emit(const ConstantDouble *C) const = 0;
+  virtual void emit(const ConstantUndef *C) const = 0;
+  virtual void emit(const ConstantRelocatable *CR) const = 0;
 
   /// Performs target-specific argument lowering.
   virtual void lowerArguments() = 0;
@@ -423,6 +427,7 @@
   const static constexpr char *H_fptoui_f64_i64 = "__Sz_fptoui_f64_i64";
   const static constexpr char *H_frem_f32 = "fmodf";
   const static constexpr char *H_frem_f64 = "fmod";
+  const static constexpr char *H_getIP_prefix = "__Sz_getIP_";
   const static constexpr char *H_sdiv_i32 = "__divsi3";
   const static constexpr char *H_sdiv_i64 = "__divdi3";
   const static constexpr char *H_sitofp_i64_f32 = "__Sz_sitofp_i64_f32";
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 551fed3..878f95e 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -50,7 +50,9 @@
   return ::Ice::ARM32::TargetHeaderARM32::create(Ctx);
 }
 
-void staticInit() { ::Ice::ARM32::TargetARM32::staticInit(); }
+void staticInit(const ::Ice::ClFlags &Flags) {
+  ::Ice::ARM32::TargetARM32::staticInit(Flags);
+}
 } // end of namespace ARM32
 
 namespace Ice {
@@ -233,7 +235,8 @@
     : TargetLowering(Func), NeedSandboxing(Ctx->getFlags().getUseSandboxing()),
       CPUFeatures(Func->getContext()->getFlags()) {}
 
-void TargetARM32::staticInit() {
+void TargetARM32::staticInit(const ClFlags &Flags) {
+  (void)Flags;
   // Limit this size (or do all bitsets need to be the same width)???
   llvm::SmallBitVector IntegerRegisters(RegARM32::Reg_NUM);
   llvm::SmallBitVector I64PairRegisters(RegARM32::Reg_NUM);
@@ -897,7 +900,7 @@
   const Type VarTy = Var->getType();
   Str << "[" << getRegName(BaseRegNum, VarTy);
   if (Offset != 0) {
-    Str << ", " << getConstantPrefix() << Offset;
+    Str << ", #" << Offset;
   }
   Str << "]";
 }
@@ -5706,7 +5709,7 @@
   if (!BuildDefs::dump())
     return;
   Ostream &Str = Ctx->getStrEmit();
-  Str << getConstantPrefix() << C->getValue();
+  Str << "#" << C->getValue();
 }
 
 void TargetARM32::emit(const ConstantInteger64 *) const {
@@ -5727,6 +5730,14 @@
   llvm::report_fatal_error("undef value encountered by emitter.");
 }
 
+void TargetARM32::emit(const ConstantRelocatable *C) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Ctx->getStrEmit();
+  Str << "#";
+  emitWithoutPrefix(C);
+}
+
 void TargetARM32::lowerInt1ForSelect(Variable *Dest, Operand *Boolean,
                                      Operand *TrueValue, Operand *FalseValue) {
   Operand *_1 = legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex);
@@ -6251,10 +6262,12 @@
 
 void TargetDataARM32::lowerGlobals(const VariableDeclarationList &Vars,
                                    const IceString &SectionSuffix) {
+  const bool IsPIC = Ctx->getFlags().getUseNonsfi();
   switch (Ctx->getFlags().getOutFileType()) {
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    Writer->writeDataSection(Vars, llvm::ELF::R_ARM_ABS32, SectionSuffix);
+    Writer->writeDataSection(Vars, llvm::ELF::R_ARM_ABS32, SectionSuffix,
+                             IsPIC);
   } break;
   case FT_Asm:
   case FT_Iasm: {
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 5263bc2..2f342e0 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -57,9 +57,7 @@
   TargetARM32 &operator=(const TargetARM32 &) = delete;
 
 public:
-  ~TargetARM32() = default;
-
-  static void staticInit();
+  static void staticInit(const ClFlags &Flags);
   // TODO(jvoung): return a unique_ptr.
   static std::unique_ptr<::Ice::TargetLowering> create(Cfg *Func) {
     return makeUnique<TargetARM32>(Func);
@@ -130,12 +128,12 @@
 
   void emitVariable(const Variable *Var) const override;
 
-  const char *getConstantPrefix() const final { return "#"; }
   void emit(const ConstantUndef *C) const final;
   void emit(const ConstantInteger32 *C) const final;
   void emit(const ConstantInteger64 *C) const final;
   void emit(const ConstantFloat *C) const final;
   void emit(const ConstantDouble *C) const final;
+  void emit(const ConstantRelocatable *C) const final;
 
   void lowerArguments() override;
   void addProlog(CfgNode *Node) override;
@@ -156,12 +154,12 @@
                          /// immediates, shifted registers, or modified fp imm.
     Legal_Mem = 1 << 2,  /// includes [r0, r1 lsl #2] as well as [sp, #12]
     Legal_Rematerializable = 1 << 3,
-    Legal_All = ~Legal_Rematerializable,
+    Legal_Default = ~Legal_Rematerializable,
   };
 
   using LegalMask = uint32_t;
   Operand *legalizeUndef(Operand *From, int32_t RegNum = Variable::NoRegister);
-  Operand *legalize(Operand *From, LegalMask Allowed = Legal_All,
+  Operand *legalize(Operand *From, LegalMask Allowed = Legal_Default,
                     int32_t RegNum = Variable::NoRegister);
   Variable *legalizeToReg(Operand *From, int32_t RegNum = Variable::NoRegister);
 
diff --git a/src/IceTargetLoweringMIPS32.cpp b/src/IceTargetLoweringMIPS32.cpp
index 7ec0363..57f79f6 100644
--- a/src/IceTargetLoweringMIPS32.cpp
+++ b/src/IceTargetLoweringMIPS32.cpp
@@ -45,7 +45,9 @@
   return ::Ice::MIPS32::TargetHeaderMIPS32::create(Ctx);
 }
 
-void staticInit() { ::Ice::MIPS32::TargetMIPS32::staticInit(); }
+void staticInit(const ::Ice::ClFlags &Flags) {
+  ::Ice::MIPS32::TargetMIPS32::staticInit(Flags);
+}
 } // end of namespace MIPS32
 
 namespace Ice {
@@ -62,7 +64,8 @@
 
 TargetMIPS32::TargetMIPS32(Cfg *Func) : TargetLowering(Func) {}
 
-void TargetMIPS32::staticInit() {
+void TargetMIPS32::staticInit(const ClFlags &Flags) {
+  (void)Flags;
   llvm::SmallBitVector IntegerRegisters(RegMIPS32::Reg_NUM);
   llvm::SmallBitVector I64PairRegisters(RegMIPS32::Reg_NUM);
   llvm::SmallBitVector Float32Registers(RegMIPS32::Reg_NUM);
@@ -980,10 +983,12 @@
 
 void TargetDataMIPS32::lowerGlobals(const VariableDeclarationList &Vars,
                                     const IceString &SectionSuffix) {
+  const bool IsPIC = Ctx->getFlags().getUseNonsfi();
   switch (Ctx->getFlags().getOutFileType()) {
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    Writer->writeDataSection(Vars, llvm::ELF::R_MIPS_GLOB_DAT, SectionSuffix);
+    Writer->writeDataSection(Vars, llvm::ELF::R_MIPS_GLOB_DAT, SectionSuffix,
+                             IsPIC);
   } break;
   case FT_Asm:
   case FT_Iasm: {
diff --git a/src/IceTargetLoweringMIPS32.h b/src/IceTargetLoweringMIPS32.h
index 8a242bd..d3789f8 100644
--- a/src/IceTargetLoweringMIPS32.h
+++ b/src/IceTargetLoweringMIPS32.h
@@ -33,7 +33,7 @@
 public:
   ~TargetMIPS32() override = default;
 
-  static void staticInit();
+  static void staticInit(const ClFlags &Flags);
   static std::unique_ptr<::Ice::TargetLowering> create(Cfg *Func) {
     return makeUnique<TargetMIPS32>(Func);
   }
@@ -99,11 +99,6 @@
 
   void emitVariable(const Variable *Var) const override;
 
-  const char *getConstantPrefix() const final { return ""; }
-  void emit(const ConstantUndef *C) const final {
-    (void)C;
-    llvm::report_fatal_error("Not yet implemented");
-  }
   void emit(const ConstantInteger32 *C) const final {
     (void)C;
     llvm::report_fatal_error("Not yet implemented");
@@ -120,6 +115,14 @@
     (void)C;
     llvm::report_fatal_error("Not yet implemented");
   }
+  void emit(const ConstantUndef *C) const final {
+    (void)C;
+    llvm::report_fatal_error("Not yet implemented");
+  }
+  void emit(const ConstantRelocatable *C) const final {
+    (void)C;
+    llvm::report_fatal_error("Not yet implemented");
+  }
 
   // The following are helpers that insert lowered MIPS32 instructions with
   // minimal syntactic overhead, so that the lowering code can look as close to
@@ -193,10 +196,10 @@
     Legal_Reg = 1 << 0, // physical register, not stack location
     Legal_Imm = 1 << 1,
     Legal_Mem = 1 << 2,
-    Legal_All = ~Legal_None
+    Legal_Default = ~Legal_None
   };
   typedef uint32_t LegalMask;
-  Operand *legalize(Operand *From, LegalMask Allowed = Legal_All,
+  Operand *legalize(Operand *From, LegalMask Allowed = Legal_Default,
                     int32_t RegNum = Variable::NoRegister);
 
   Variable *legalizeToVar(Operand *From, int32_t RegNum = Variable::NoRegister);
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index b145948..8e78228 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -32,7 +32,9 @@
   return ::Ice::X8632::TargetHeaderX8632::create(Ctx);
 }
 
-void staticInit() { ::Ice::X8632::TargetX8632::staticInit(); }
+void staticInit(const ::Ice::ClFlags &Flags) {
+  ::Ice::X8632::TargetX8632::staticInit(Flags);
+}
 } // end of namespace X8632
 
 namespace Ice {
@@ -112,6 +114,14 @@
 llvm::SmallBitVector
     TargetX86Base<X8632::Traits>::ScratchRegs = llvm::SmallBitVector();
 
+template <>
+FixupKind TargetX86Base<X8632::Traits>::PcRelFixup =
+    TargetX86Base<X8632::Traits>::Traits::FK_PcRel;
+
+template <>
+FixupKind TargetX86Base<X8632::Traits>::AbsFixup =
+    TargetX86Base<X8632::Traits>::Traits::FK_Abs;
+
 //------------------------------------------------------------------------------
 //     __      ______  __     __  ______  ______  __  __   __  ______
 //    /\ \    /\  __ \/\ \  _ \ \/\  ___\/\  == \/\ \/\ "-.\ \/\  ___\
@@ -240,7 +250,8 @@
       break;
     }
   }
-  Operand *CallTarget = legalize(Instr->getCallTarget());
+  Operand *CallTarget =
+      legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
   const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
   if (NeedSandboxing) {
     if (llvm::isa<Constant>(CallTarget)) {
@@ -555,6 +566,43 @@
   if (!IsEbpBasedFrame)
     BasicFrameOffset += SpillAreaSizeBytes;
 
+  // If there is a non-deleted InstX86GetIP instruction, we need to move it to
+  // the point after the stack frame has stabilized but before
+  // register-allocated in-args are copied into their home registers.  It would
+  // be slightly faster to search for the GetIP instruction before other prolog
+  // instructions are inserted, but it's more clear to do the whole
+  // transformation in a single place.
+  Traits::Insts::GetIP *GetIPInst = nullptr;
+  if (Ctx->getFlags().getUseNonsfi()) {
+    for (Inst &Instr : Node->getInsts()) {
+      if (auto *GetIP = llvm::dyn_cast<Traits::Insts::GetIP>(&Instr)) {
+        if (!Instr.isDeleted())
+          GetIPInst = GetIP;
+        break;
+      }
+    }
+  }
+  // Delete any existing InstX86GetIP instruction and reinsert it here.  Also,
+  // insert the call to the helper function and the spill to the stack, to
+  // simplify emission.
+  if (GetIPInst) {
+    GetIPInst->setDeleted();
+    Variable *Dest = GetIPInst->getDest();
+    Variable *CallDest =
+        Dest->hasReg() ? Dest
+                       : getPhysicalRegister(Traits::RegisterSet::Reg_eax);
+    // Call the getIP_<reg> helper.
+    IceString RegName = Traits::getRegName(CallDest->getRegNum());
+    Constant *CallTarget = Ctx->getConstantExternSym(H_getIP_prefix + RegName);
+    Context.insert<Traits::Insts::Call>(CallDest, CallTarget);
+    // Insert a new version of InstX86GetIP.
+    Context.insert<Traits::Insts::GetIP>(CallDest);
+    // Spill the register to its home stack location if necessary.
+    if (!Dest->hasReg()) {
+      _mov(Dest, CallDest);
+    }
+  }
+
   const VarList &Args = Func->getArgs();
   size_t InArgsSizeBytes = 0;
   unsigned NumXmmArgs = 0;
@@ -695,8 +743,10 @@
   if (!BuildDefs::dump())
     return;
   Ostream &Str = Ctx->getStrEmit();
-  IceString MangledName = Ctx->mangleName(Func->getFunctionName());
-  Str << "\t.section\t.rodata." << MangledName
+  const bool UseNonsfi = Ctx->getFlags().getUseNonsfi();
+  const IceString MangledName = Ctx->mangleName(Func->getFunctionName());
+  const IceString Prefix = UseNonsfi ? ".data.rel.ro." : ".rodata.";
+  Str << "\t.section\t" << Prefix << MangledName
       << "$jumptable,\"a\",@progbits\n";
   Str << "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n";
   Str << InstJumpTable::makeName(MangledName, JumpTable->getId()) << ":";
@@ -855,11 +905,12 @@
 }
 
 void TargetDataX8632::lowerJumpTables() {
+  const bool IsPIC = Ctx->getFlags().getUseNonsfi();
   switch (Ctx->getFlags().getOutFileType()) {
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
     for (const JumpTableData &JT : Ctx->getJumpTables())
-      Writer->writeJumpTable(JT, TargetX8632::Traits::RelFixup);
+      Writer->writeJumpTable(JT, TargetX8632::Traits::FK_Abs, IsPIC);
   } break;
   case FT_Asm:
     // Already emitted from Cfg
@@ -868,8 +919,9 @@
     if (!BuildDefs::dump())
       return;
     Ostream &Str = Ctx->getStrEmit();
+    const IceString Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
     for (const JumpTableData &JT : Ctx->getJumpTables()) {
-      Str << "\t.section\t.rodata." << JT.getFunctionName()
+      Str << "\t.section\t" << Prefix << JT.getFunctionName()
           << "$jumptable,\"a\",@progbits\n";
       Str << "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n";
       Str << InstJumpTable::makeName(JT.getFunctionName(), JT.getId()) << ":";
@@ -885,11 +937,12 @@
 
 void TargetDataX8632::lowerGlobals(const VariableDeclarationList &Vars,
                                    const IceString &SectionSuffix) {
+  const bool IsPIC = Ctx->getFlags().getUseNonsfi();
   switch (Ctx->getFlags().getOutFileType()) {
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    Writer->writeDataSection(Vars, TargetX8632::Traits::RelFixup,
-                             SectionSuffix);
+    Writer->writeDataSection(Vars, TargetX8632::Traits::FK_Abs, SectionSuffix,
+                             IsPIC);
   } break;
   case FT_Asm:
   case FT_Iasm: {
diff --git a/src/IceTargetLoweringX8632Traits.h b/src/IceTargetLoweringX8632Traits.h
index dbd4565..2010ee4 100644
--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -72,8 +72,10 @@
   static const SizeT FramePtr = RegX8632::Reg_ebp;
   static const GPRRegister Encoded_Reg_Accumulator = RegX8632::Encoded_Reg_eax;
   static const GPRRegister Encoded_Reg_Counter = RegX8632::Encoded_Reg_ecx;
-  static const FixupKind PcRelFixup = llvm::ELF::R_386_PC32;
-  static const FixupKind RelFixup = llvm::ELF::R_386_32;
+  static constexpr FixupKind FK_PcRel = llvm::ELF::R_386_PC32;
+  static constexpr FixupKind FK_Abs = llvm::ELF::R_386_32;
+  static constexpr FixupKind FK_Gotoff = llvm::ELF::R_386_GOTOFF;
+  static constexpr FixupKind FK_GotPC = llvm::ELF::R_386_GOTPC;
 
   class Operand {
   public:
@@ -796,9 +798,10 @@
     static X86OperandMem *create(Cfg *Func, Type Ty, Variable *Base,
                                  Constant *Offset, Variable *Index = nullptr,
                                  uint16_t Shift = 0,
-                                 SegmentRegisters SegmentReg = DefaultSegment) {
-      return new (Func->allocate<X86OperandMem>())
-          X86OperandMem(Func, Ty, Base, Offset, Index, Shift, SegmentReg);
+                                 SegmentRegisters SegmentReg = DefaultSegment,
+                                 bool IsPIC = false) {
+      return new (Func->allocate<X86OperandMem>()) X86OperandMem(
+          Func, Ty, Base, Offset, Index, Shift, SegmentReg, IsPIC);
     }
     Variable *getBase() const { return Base; }
     Constant *getOffset() const { return Offset; }
@@ -806,6 +809,8 @@
     uint16_t getShift() const { return Shift; }
     SegmentRegisters getSegmentRegister() const { return SegmentReg; }
     void emitSegmentOverride(Assembler *Asm) const;
+    void setIsPIC() { IsPIC = true; }
+    bool getIsPIC() const { return IsPIC; }
     Address toAsmAddress(Assembler *Asm,
                          const Ice::TargetLowering *Target) const;
 
@@ -823,17 +828,19 @@
 
   private:
     X86OperandMem(Cfg *Func, Type Ty, Variable *Base, Constant *Offset,
-                  Variable *Index, uint16_t Shift, SegmentRegisters SegmentReg);
+                  Variable *Index, uint16_t Shift, SegmentRegisters SegmentReg,
+                  bool IsPIC);
 
     Variable *Base;
     Constant *Offset;
     Variable *Index;
     uint16_t Shift;
     SegmentRegisters SegmentReg : 16;
+    bool IsPIC;
     /// A flag to show if this memory operand is a randomized one. Randomized
     /// memory operands are generated in
     /// TargetX86Base::randomizeOrPoolImmediate()
-    bool Randomized;
+    bool Randomized = false;
   };
 
   /// VariableSplit is a way to treat an f64 memory location as a pair of i32
diff --git a/src/IceTargetLoweringX8664.cpp b/src/IceTargetLoweringX8664.cpp
index 2008afa..0616ba1 100644
--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -32,7 +32,9 @@
   return ::Ice::X8664::TargetHeaderX8664::create(Ctx);
 }
 
-void staticInit() { ::Ice::X8664::TargetX8664::staticInit(); }
+void staticInit(const ::Ice::ClFlags &Flags) {
+  ::Ice::X8664::TargetX8664::staticInit(Flags);
+}
 } // end of namespace X8664
 
 namespace Ice {
@@ -112,6 +114,14 @@
 llvm::SmallBitVector
     TargetX86Base<X8664::Traits>::ScratchRegs = llvm::SmallBitVector();
 
+template <>
+FixupKind TargetX86Base<X8664::Traits>::PcRelFixup =
+    TargetX86Base<X8664::Traits>::Traits::FK_PcRel;
+
+template <>
+FixupKind TargetX86Base<X8664::Traits>::AbsFixup =
+    TargetX86Base<X8664::Traits>::Traits::FK_Abs;
+
 //------------------------------------------------------------------------------
 //     __      ______  __     __  ______  ______  __  __   __  ______
 //    /\ \    /\  __ \/\ \  _ \ \/\  ___\/\  == \/\ \/\ "-.\ \/\  ___\
@@ -348,7 +358,7 @@
 
 void TargetX8664::lowerArguments() {
   VarList &Args = Func->getArgs();
-  // The first eight vetcor typed arguments (as well as fp arguments) are
+  // The first eight vector typed arguments (as well as fp arguments) are
   // passed in %xmm0 through %xmm7 regardless of their position in the argument
   // list.
   unsigned NumXmmArgs = 0;
@@ -890,11 +900,12 @@
 }
 
 void TargetDataX8664::lowerJumpTables() {
+  const bool IsPIC = Ctx->getFlags().getUseNonsfi();
   switch (Ctx->getFlags().getOutFileType()) {
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
     for (const JumpTableData &JumpTable : Ctx->getJumpTables())
-      Writer->writeJumpTable(JumpTable, TargetX8664::Traits::RelFixup);
+      Writer->writeJumpTable(JumpTable, TargetX8664::Traits::FK_Abs, IsPIC);
   } break;
   case FT_Asm:
     // Already emitted from Cfg
@@ -920,11 +931,12 @@
 
 void TargetDataX8664::lowerGlobals(const VariableDeclarationList &Vars,
                                    const IceString &SectionSuffix) {
+  const bool IsPIC = Ctx->getFlags().getUseNonsfi();
   switch (Ctx->getFlags().getOutFileType()) {
   case FT_Elf: {
     ELFObjectWriter *Writer = Ctx->getObjectWriter();
-    Writer->writeDataSection(Vars, TargetX8664::Traits::RelFixup,
-                             SectionSuffix);
+    Writer->writeDataSection(Vars, TargetX8664::Traits::FK_Abs, SectionSuffix,
+                             IsPIC);
   } break;
   case FT_Asm:
   case FT_Iasm: {
diff --git a/src/IceTargetLoweringX8664Traits.h b/src/IceTargetLoweringX8664Traits.h
index d64b607..314416c 100644
--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -72,8 +72,10 @@
   static const SizeT FramePtr = RegX8664::Reg_rbp;
   static const GPRRegister Encoded_Reg_Accumulator = RegX8664::Encoded_Reg_eax;
   static const GPRRegister Encoded_Reg_Counter = RegX8664::Encoded_Reg_ecx;
-  static const FixupKind PcRelFixup = llvm::ELF::R_X86_64_PC32;
-  static const FixupKind RelFixup = llvm::ELF::R_X86_64_32;
+  static constexpr FixupKind FK_PcRel = llvm::ELF::R_X86_64_PC32;
+  static constexpr FixupKind FK_Abs = llvm::ELF::R_X86_64_32;
+  static constexpr FixupKind FK_Gotoff = llvm::ELF::R_X86_64_GOTOFF64;
+  static constexpr FixupKind FK_GotPC = llvm::ELF::R_X86_64_GOTPC32;
 
   class Operand {
   public:
@@ -271,7 +273,7 @@
 
     static Address ofConstPool(Assembler *Asm, const Constant *Imm) {
       // TODO(jpp): ???
-      AssemblerFixup *Fixup = Asm->createFixup(RelFixup, Imm);
+      AssemblerFixup *Fixup = Asm->createFixup(FK_Abs, Imm);
       const RelocOffsetT Offset = 4;
       return Address(Offset, Fixup);
     }
@@ -839,11 +841,12 @@
     static X86OperandMem *
     create(Cfg *Func, Type Ty, Variable *Base, Constant *Offset,
            Variable *Index = nullptr, uint16_t Shift = 0,
-           SegmentRegisters SegmentRegister = DefaultSegment) {
+           SegmentRegisters SegmentRegister = DefaultSegment,
+           bool IsPIC = false) {
       assert(SegmentRegister == DefaultSegment);
       (void)SegmentRegister;
       return new (Func->allocate<X86OperandMem>())
-          X86OperandMem(Func, Ty, Base, Offset, Index, Shift);
+          X86OperandMem(Func, Ty, Base, Offset, Index, Shift, IsPIC);
     }
     Variable *getBase() const { return Base; }
     Constant *getOffset() const { return Offset; }
@@ -851,6 +854,8 @@
     uint16_t getShift() const { return Shift; }
     SegmentRegisters getSegmentRegister() const { return DefaultSegment; }
     void emitSegmentOverride(Assembler *) const {}
+    void setIsPIC() { IsPIC = true; }
+    bool getIsPIC() const { return IsPIC; }
     Address toAsmAddress(Assembler *Asm,
                          const Ice::TargetLowering *Target) const;
 
@@ -868,12 +873,13 @@
 
   private:
     X86OperandMem(Cfg *Func, Type Ty, Variable *Base, Constant *Offset,
-                  Variable *Index, uint16_t Shift);
+                  Variable *Index, uint16_t Shift, bool IsPIC);
 
     Variable *Base;
     Constant *Offset;
     Variable *Index;
     uint16_t Shift;
+    bool IsPIC;
     /// A flag to show if this memory operand is a randomized one. Randomized
     /// memory operands are generated in
     /// TargetX86Base::randomizeOrPoolImmediate()
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 3c24cd3..d39bda0 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -79,9 +79,12 @@
 
   ~TargetX86Base() override = default;
 
-  static void staticInit();
+  static void staticInit(const ClFlags &Flags);
   static TargetX86Base *create(Cfg *Func) { return new TargetX86Base(Func); }
 
+  static FixupKind getPcRelFixup() { return PcRelFixup; }
+  static FixupKind getAbsFixup() { return AbsFixup; }
+
   void translateOm1() override;
   void translateO2() override;
   void doLoadOpt();
@@ -146,12 +149,12 @@
 
   void emitVariable(const Variable *Var) const override;
 
-  const char *getConstantPrefix() const final { return "$"; }
-  void emit(const ConstantUndef *C) const final;
   void emit(const ConstantInteger32 *C) const final;
   void emit(const ConstantInteger64 *C) const final;
   void emit(const ConstantFloat *C) const final;
   void emit(const ConstantDouble *C) const final;
+  void emit(const ConstantUndef *C) const final;
+  void emit(const ConstantRelocatable *C) const final;
 
   void initNodeForLowering(CfgNode *Node) override;
 
@@ -284,10 +287,12 @@
     Legal_Imm = 1 << 1,
     Legal_Mem = 1 << 2, // includes [eax+4*ecx] as well as [esp+12]
     Legal_Rematerializable = 1 << 3,
-    Legal_All = ~Legal_Rematerializable
+    Legal_AddrAbs = 1 << 4, // ConstantRelocatable doesn't have to add GotVar
+    Legal_Default = ~(Legal_Rematerializable | Legal_AddrAbs)
+    // TODO(stichnot): Figure out whether this default works for x86-64.
   };
   using LegalMask = uint32_t;
-  Operand *legalize(Operand *From, LegalMask Allowed = Legal_All,
+  Operand *legalize(Operand *From, LegalMask Allowed = Legal_Default,
                     int32_t RegNum = Variable::NoRegister);
   Variable *legalizeToReg(Operand *From, int32_t RegNum = Variable::NoRegister);
   /// Legalize the first source operand for use in the cmp instruction.
@@ -744,6 +749,9 @@
   static llvm::SmallBitVector ScratchRegs;
   llvm::SmallBitVector RegsUsed;
   std::array<VarList, IceType_NUM> PhysicalRegisters;
+  // GotVar is a Variable that holds the GlobalOffsetTable address for Non-SFI
+  // mode.
+  Variable *GotVar = nullptr;
 
   /// Randomize a given immediate operand
   Operand *randomizeOrPoolImmediate(Constant *Immediate,
@@ -811,6 +819,10 @@
   /// Optimizations for idiom recognition.
   bool lowerOptimizeFcmpSelect(const InstFcmp *Fcmp, const InstSelect *Select);
 
+  /// Emit code that initializes the value of the GotVar near the start of the
+  /// function.  (This code is emitted only in Non-SFI mode.)
+  void initGotVarIfNeeded();
+
   /// Complains loudly if invoked because the cpu can handle 64-bit types
   /// natively.
   template <typename T = Traits>
@@ -825,6 +837,9 @@
   lowerIcmp64(const InstIcmp *Icmp, const Inst *Consumer);
 
   BoolFolding FoldingInfo;
+
+  static FixupKind PcRelFixup;
+  static FixupKind AbsFixup;
 };
 } // end of namespace X86NAMESPACE
 } // end of namespace Ice
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 04d9cdc..5127768 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -323,13 +323,19 @@
   }
 }
 
-template <typename TraitsType> void TargetX86Base<TraitsType>::staticInit() {
+template <typename TraitsType>
+void TargetX86Base<TraitsType>::staticInit(const ClFlags &Flags) {
   Traits::initRegisterSet(&TypeToRegisterSet, &RegisterAliases, &ScratchRegs);
+  PcRelFixup = Traits::FK_PcRel;
+  AbsFixup = Flags.getUseNonsfi() ? Traits::FK_Gotoff : Traits::FK_Abs;
 }
 
 template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
   TimerMarker T(TimerStack::TT_O2, Func);
 
+  if (!Traits::Is64Bit && Func->getContext()->getFlags().getUseNonsfi()) {
+    GotVar = Func->makeVariable(IceType_i32);
+  }
   genTargetHelperCalls();
   Func->dump("After target helper call insertion");
 
@@ -398,6 +404,7 @@
   Func->genCode();
   if (Func->hasError())
     return;
+  initGotVarIfNeeded();
   Func->dump("After x86 codegen");
 
   // Register allocation. This requires instruction renumbering and full
@@ -456,6 +463,9 @@
 template <typename TraitsType> void TargetX86Base<TraitsType>::translateOm1() {
   TimerMarker T(TimerStack::TT_Om1, Func);
 
+  if (!Traits::Is64Bit && Func->getContext()->getFlags().getUseNonsfi()) {
+    GotVar = Func->makeVariable(IceType_i32);
+  }
   genTargetHelperCalls();
 
   // Do not merge Alloca instructions, and lay out the stack.
@@ -478,6 +488,7 @@
   Func->genCode();
   if (Func->hasError())
     return;
+  initGotVarIfNeeded();
   Func->dump("After initial x8632 codegen");
 
   regAlloc(RAK_InfOnly);
@@ -803,7 +814,8 @@
     return;
   }
   if (Var->mustHaveReg()) {
-    llvm_unreachable("Infinite-weight Variable has no register assigned");
+    llvm::report_fatal_error(
+        "Infinite-weight Variable has no register assigned");
   }
   const int32_t Offset = Var->getStackOffset();
   int32_t BaseRegNum = Var->getBaseRegNum();
@@ -829,9 +841,10 @@
 typename TargetX86Base<TraitsType>::X86Address
 TargetX86Base<TraitsType>::stackVarToAsmOperand(const Variable *Var) const {
   if (Var->hasReg())
-    llvm_unreachable("Stack Variable has a register assigned");
+    llvm::report_fatal_error("Stack Variable has a register assigned");
   if (Var->mustHaveReg()) {
-    llvm_unreachable("Infinite-weight Variable has no register assigned");
+    llvm::report_fatal_error(
+        "Infinite-weight Variable has no register assigned");
   }
   int32_t Offset = Var->getStackOffset();
   int32_t BaseRegNum = Var->getBaseRegNum();
@@ -910,7 +923,7 @@
   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
     auto *MemOperand = X86OperandMem::create(
         Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
-        Mem->getShift(), Mem->getSegmentRegister());
+        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsPIC());
     // Test if we should randomize or pool the offset, if so randomize it or
     // pool it then create mem operand with the blinded/pooled constant.
     // Otherwise, return the mem operand as ordinary mem operand.
@@ -950,7 +963,7 @@
     }
     auto *MemOperand = X86OperandMem::create(
         Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
-        Mem->getShift(), Mem->getSegmentRegister());
+        Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsPIC());
     // Test if the Offset is an eligible i32 constants for randomization and
     // pooling. Blind/pool it if it is. Otherwise return as oridinary mem
     // operand.
@@ -968,6 +981,23 @@
 }
 
 template <typename TraitsType>
+void TargetX86Base<TraitsType>::initGotVarIfNeeded() {
+  if (!Func->getContext()->getFlags().getUseNonsfi())
+    return;
+  if (Traits::Is64Bit) {
+    // Probably no implementation is needed, but report an error to be safe.
+    llvm::report_fatal_error(
+        "Need to implement initGotVarIfNeeded() for 64-bit.");
+  }
+  // Insert the GotVar assignment as the very first lowered instruction.  Later,
+  // it will be moved into the right place - after the stack frame is set up but
+  // before in-args are copied into registers.
+  Context.init(Func->getEntryNode());
+  Context.setInsertPoint(Context.getCur());
+  Context.insert<typename Traits::Insts::GetIP>(GotVar);
+}
+
+template <typename TraitsType>
 void TargetX86Base<TraitsType>::lowerAlloca(const InstAlloca *Inst) {
   // Conservatively require the stack to be aligned. Some stack adjustment
   // operations implemented below assume that the stack is aligned before the
@@ -3984,6 +4014,11 @@
 template <typename TraitsType>
 void TargetX86Base<TraitsType>::typedLoad(Type Ty, Variable *Dest,
                                           Variable *Base, Constant *Offset) {
+  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
+  // legalize Mem properly.
+  if (Offset)
+    assert(!llvm::isa<ConstantRelocatable>(Offset));
+
   auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
 
   if (isVectorType(Ty))
@@ -3997,6 +4032,11 @@
 template <typename TraitsType>
 void TargetX86Base<TraitsType>::typedStore(Type Ty, Variable *Value,
                                            Variable *Base, Constant *Offset) {
+  // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
+  // legalize Mem properly.
+  if (Offset)
+    assert(!llvm::isa<ConstantRelocatable>(Offset));
+
   auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
 
   if (isVectorType(Ty))
@@ -4306,9 +4346,9 @@
       << ", Relocatable=" << Relocatable << "\n";
 }
 
-inline bool matchAssign(const VariablesMetadata *VMetadata, Variable *&Var,
-                        ConstantRelocatable *&Relocatable, int32_t &Offset,
-                        const Inst *&Reason) {
+inline bool matchAssign(const VariablesMetadata *VMetadata, Variable *GotVar,
+                        Variable *&Var, ConstantRelocatable *&Relocatable,
+                        int32_t &Offset, const Inst *&Reason) {
   // Var originates from Var=SrcVar ==> set Var:=SrcVar
   if (Var == nullptr)
     return false;
@@ -4335,7 +4375,7 @@
         return true;
       } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
         if (Relocatable == nullptr) {
-          Var = nullptr;
+          Var = GotVar;
           Relocatable = AddReloc;
           Reason = VarAssign;
           return true;
@@ -4454,7 +4494,9 @@
   return false;
 }
 
-inline bool matchOffsetBase(const VariablesMetadata *VMetadata, Variable *&Base,
+inline bool matchOffsetBase(const VariablesMetadata *VMetadata,
+                            Variable *GotVar, Variable *&Base,
+                            Variable *&BaseOther,
                             ConstantRelocatable *&Relocatable, int32_t &Offset,
                             const Inst *&Reason) {
   // Base is Base=Var+Const || Base is Base=Const+Var ==>
@@ -4505,6 +4547,8 @@
       NewRelocatable = Reloc0;
     else if (Reloc1)
       NewRelocatable = Reloc1;
+    if ((Reloc0 || Reloc1) && BaseOther && GotVar)
+      return false;
     // Compute the updated constant offset.
     if (Const0) {
       int32_t MoreOffset = IsAdd ? Const0->getValue() : -Const0->getValue();
@@ -4520,6 +4564,10 @@
     }
     // Update the computed address parameters once we are sure optimization
     // is valid.
+    if ((Reloc0 || Reloc1) && GotVar) {
+      assert(BaseOther == nullptr);
+      BaseOther = GotVar;
+    }
     Base = NewBase;
     Offset = NewOffset;
     Relocatable = NewRelocatable;
@@ -4537,7 +4585,7 @@
 //   Base is a Variable,
 //   Index == nullptr,
 //   Shift == 0
-inline bool computeAddressOpt(Cfg *Func, const Inst *Instr,
+inline bool computeAddressOpt(Cfg *Func, const Inst *Instr, Variable *GotVar,
                               ConstantRelocatable *&Relocatable,
                               int32_t &Offset, Variable *&Base,
                               Variable *&Index, uint16_t &Shift) {
@@ -4568,7 +4616,7 @@
       Reason = nullptr;
     }
     // Update Base and Index to follow through assignments to definitions.
-    if (matchAssign(VMetadata, Base, Relocatable, Offset, Reason)) {
+    if (matchAssign(VMetadata, GotVar, Base, Relocatable, Offset, Reason)) {
       // Assignments of Base from a Relocatable or ConstantInt32 can result
       // in Base becoming nullptr.  To avoid code duplication in this loop we
       // prefer that Base be non-nullptr if possible.
@@ -4576,7 +4624,7 @@
         std::swap(Base, Index);
       continue;
     }
-    if (matchAssign(VMetadata, Index, Relocatable, Offset, Reason))
+    if (matchAssign(VMetadata, GotVar, Index, Relocatable, Offset, Reason))
       continue;
 
     if (!MockBounds) {
@@ -4605,10 +4653,11 @@
     // Update Offset to reflect additions/subtractions with constants and
     // relocatables.
     // TODO: consider overflow issues with respect to Offset.
-    if (matchOffsetBase(VMetadata, Base, Relocatable, Offset, Reason))
+    if (matchOffsetBase(VMetadata, GotVar, Base, Index, Relocatable, Offset,
+                        Reason))
       continue;
-    if (Shift == 0 &&
-        matchOffsetBase(VMetadata, Index, Relocatable, Offset, Reason))
+    if (Shift == 0 && matchOffsetBase(VMetadata, GotVar, Index, Base,
+                                      Relocatable, Offset, Reason))
       continue;
     // TODO(sehr, stichnot): Handle updates of Index with Shift != 0.
     // Index is Index=Var+Const ==>
@@ -4619,6 +4668,12 @@
     //   set Index=Var, Offset-=(Const<<Shift)
     break;
   } while (Reason);
+  // Undo any addition of GotVar.  It will be added back when the mem operand is
+  // legalized.
+  if (Base == GotVar)
+    Base = nullptr;
+  if (Index == GotVar)
+    Index = nullptr;
   return AddressWasOptimized;
 }
 
@@ -4683,10 +4738,9 @@
 template <typename TraitsType>
 void TargetX86Base<TraitsType>::lowerLoad(const InstLoad *Load) {
   // A Load instruction can be treated the same as an Assign instruction, after
-  // the source operand is transformed into an X86OperandMem operand.
-  // Note that the address mode optimization already creates an
-  // X86OperandMem operand, so it doesn't need another level of
-  // transformation.
+  // the source operand is transformed into an X86OperandMem operand.  Note that
+  // the address mode optimization already creates an X86OperandMem operand, so
+  // it doesn't need another level of transformation.
   Variable *DestLoad = Load->getDest();
   Type Ty = DestLoad->getType();
   Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
@@ -4708,9 +4762,10 @@
   // computeAddressOpt only works at the level of Variables and Constants, not
   // other X86OperandMem, so there should be no mention of segment
   // registers there either.
-  const SegmentRegisters SegmentReg = X86OperandMem::DefaultSegment;
+  constexpr auto SegmentReg = X86OperandMem::SegmentRegisters::DefaultSegment;
   auto *Base = llvm::dyn_cast<Variable>(Addr);
-  if (computeAddressOpt(Func, Inst, Relocatable, Offset, Base, Index, Shift)) {
+  if (computeAddressOpt(Func, Inst, GotVar, Relocatable, Offset, Base, Index,
+                        Shift)) {
     Inst->setDeleted();
     Constant *OffsetOp = nullptr;
     if (Relocatable == nullptr) {
@@ -4720,6 +4775,8 @@
                                      Relocatable->getName(),
                                      Relocatable->getSuppressMangling());
     }
+    // The new mem operand is created without IsPIC being set, because
+    // computeAddressOpt() doesn't include GotVar in its final result.
     Addr = X86OperandMem::create(Func, Dest->getType(), Base, OffsetOp, Index,
                                  Shift, SegmentReg);
     Context.insert<InstLoad>(Dest, Addr);
@@ -5011,8 +5068,9 @@
   // computeAddressOpt only works at the level of Variables and Constants, not
   // other X86OperandMem, so there should be no mention of segment
   // registers there either.
-  const SegmentRegisters SegmentReg = X86OperandMem::DefaultSegment;
-  if (computeAddressOpt(Func, Inst, Relocatable, Offset, Base, Index, Shift)) {
+  constexpr auto SegmentReg = X86OperandMem::SegmentRegisters::DefaultSegment;
+  if (computeAddressOpt(Func, Inst, GotVar, Relocatable, Offset, Base, Index,
+                        Shift)) {
     Inst->setDeleted();
     Constant *OffsetOp = nullptr;
     if (Relocatable == nullptr) {
@@ -5022,6 +5080,8 @@
                                      Relocatable->getName(),
                                      Relocatable->getSuppressMangling());
     }
+    // The new mem operand is created without IsPIC being set, because
+    // computeAddressOpt() doesn't include GotVar in its final result.
     Addr = X86OperandMem::create(Func, Data->getType(), Base, OffsetOp, Index,
                                  Shift, SegmentReg);
     auto *NewStore = Context.insert<InstStore>(Data, Addr);
@@ -5083,15 +5143,16 @@
 
     constexpr RelocOffsetT RelocOffset = 0;
     constexpr bool SuppressMangling = true;
+    const bool IsPIC = Ctx->getFlags().getUseNonsfi();
     IceString MangledName = Ctx->mangleName(Func->getFunctionName());
-    Constant *Base = Ctx->getConstantSym(
+    Variable *Base = IsPIC ? legalizeToReg(GotVar) : nullptr;
+    Constant *Offset = Ctx->getConstantSym(
         RelocOffset, InstJumpTable::makeName(MangledName, JumpTable->getId()),
         SuppressMangling);
-    Constant *Offset = nullptr;
     uint16_t Shift = typeWidthInBytesLog2(getPointerType());
-    // TODO(ascull): remove need for legalize by allowing null base in memop
+    constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;
     auto *TargetInMemory = X86OperandMem::create(
-        Func, getPointerType(), legalizeToReg(Base), Offset, Index, Shift);
+        Func, getPointerType(), Base, Offset, Index, Shift, Segment, IsPIC);
     Variable *Target = nullptr;
     _mov(Target, TargetInMemory);
     lowerIndirectJump(Target);
@@ -5417,8 +5478,31 @@
 
 /// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to preserve
 /// integrity of liveness analysis. Undef values are also turned into zeroes,
-/// since loOperand() and hiOperand() don't expect Undef input.
+/// since loOperand() and hiOperand() don't expect Undef input.  Also, in
+/// Non-SFI mode, add a FakeUse(GotVar) for every pooled constant operand.
 template <typename TraitsType> void TargetX86Base<TraitsType>::prelowerPhis() {
+  if (Ctx->getFlags().getUseNonsfi()) {
+    assert(GotVar);
+    CfgNode *Node = Context.getNode();
+    uint32_t GotVarUseCount = 0;
+    for (Inst &I : Node->getPhis()) {
+      auto *Phi = llvm::dyn_cast<InstPhi>(&I);
+      if (Phi->isDeleted())
+        continue;
+      for (SizeT I = 0; I < Phi->getSrcSize(); ++I) {
+        Operand *Src = Phi->getSrc(I);
+        // TODO(stichnot): This over-counts for +0.0, and under-counts for other
+        // kinds of pooling.
+        if (llvm::isa<ConstantRelocatable>(Src) ||
+            llvm::isa<ConstantFloat>(Src) || llvm::isa<ConstantDouble>(Src)) {
+          ++GotVarUseCount;
+        }
+      }
+    }
+    if (GotVarUseCount) {
+      Node->getInsts().push_front(InstFakeUse::create(Func, GotVar));
+    }
+  }
   if (Traits::Is64Bit) {
     // On x86-64 we don't need to prelower phis -- the architecture can handle
     // 64-bit integer natively.
@@ -5901,7 +5985,8 @@
 template <typename TraitsType>
 Operand *TargetX86Base<TraitsType>::legalize(Operand *From, LegalMask Allowed,
                                              int32_t RegNum) {
-  Type Ty = From->getType();
+  const bool UseNonsfi = Func->getContext()->getFlags().getUseNonsfi();
+  const Type Ty = From->getType();
   // Assert that a physical register is allowed. To date, all calls to
   // legalize() allow a physical register. If a physical register needs to be
   // explicitly disallowed, then new code will need to be written to force a
@@ -5935,6 +6020,7 @@
     // Base and Index components are in physical registers.
     Variable *Base = Mem->getBase();
     Variable *Index = Mem->getIndex();
+    Constant *Offset = Mem->getOffset();
     Variable *RegBase = nullptr;
     Variable *RegIndex = nullptr;
     if (Base) {
@@ -5945,9 +6031,27 @@
       RegIndex = llvm::cast<Variable>(
           legalize(Index, Legal_Reg | Legal_Rematerializable));
     }
+    // For Non-SFI mode, if the Offset field is a ConstantRelocatable, we
+    // replace either Base or Index with a legalized GotVar.  At emission time,
+    // the ConstantRelocatable will be emitted with the @GOTOFF relocation.
+    bool NeedPIC = false;
+    if (UseNonsfi && !Mem->getIsPIC() && Offset &&
+        llvm::isa<ConstantRelocatable>(Offset)) {
+      assert(!(Allowed & Legal_AddrAbs));
+      NeedPIC = true;
+      if (RegBase == nullptr) {
+        RegBase = legalizeToReg(GotVar);
+      } else if (RegIndex == nullptr) {
+        RegIndex = legalizeToReg(GotVar);
+      } else {
+        llvm::report_fatal_error(
+            "Either Base or Index must be unused in Non-SFI mode");
+      }
+    }
     if (Base != RegBase || Index != RegIndex) {
-      Mem = X86OperandMem::create(Func, Ty, RegBase, Mem->getOffset(), RegIndex,
-                                  Mem->getShift(), Mem->getSegmentRegister());
+      Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex,
+                                  Mem->getShift(), Mem->getSegmentRegister(),
+                                  NeedPIC);
     }
 
     // For all Memory Operands, we do randomization/pooling here
@@ -5958,6 +6062,7 @@
     }
     return From;
   }
+
   if (auto *Const = llvm::dyn_cast<Constant>(From)) {
     if (llvm::isa<ConstantUndef>(Const)) {
       From = legalizeUndef(Const, RegNum);
@@ -5988,6 +6093,20 @@
       }
     }
 
+    // If the operand is a ConstantRelocatable, and Legal_AddrAbs is not
+    // specified, and UseNonsfi is enabled, we need to add GotVar.
+    if (auto *CR = llvm::dyn_cast<ConstantRelocatable>(Const)) {
+      if (UseNonsfi && !(Allowed & Legal_AddrAbs)) {
+        assert(Ty == IceType_i32);
+        Variable *RegBase = legalizeToReg(GotVar);
+        Variable *NewVar = makeReg(Ty, RegNum);
+        auto *Mem = Traits::X86OperandMem::create(Func, Ty, RegBase, CR);
+        Mem->setIsPIC();
+        _lea(NewVar, Mem);
+        From = NewVar;
+      }
+    }
+
     // Convert a scalar floating point constant into an explicit memory
     // operand.
     if (isScalarFloatingType(Ty)) {
@@ -5998,13 +6117,16 @@
         if (Utils::isPositiveZero(ConstDouble->getValue()))
           return makeZeroedRegister(Ty, RegNum);
       }
-      Variable *Base = nullptr;
+      Variable *Base = UseNonsfi ? legalizeToReg(GotVar) : nullptr;
       std::string Buffer;
       llvm::raw_string_ostream StrBuf(Buffer);
       llvm::cast<Constant>(From)->emitPoolLabel(StrBuf, Ctx);
       llvm::cast<Constant>(From)->setShouldBePooled(true);
       Constant *Offset = Ctx->getConstantSym(0, StrBuf.str(), true);
-      From = X86OperandMem::create(Func, Ty, Base, Offset);
+      auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
+      if (UseNonsfi)
+        Mem->setIsPIC();
+      From = Mem;
     }
     bool NeedsReg = false;
     if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
@@ -6018,6 +6140,7 @@
     }
     return From;
   }
+
   if (auto *Var = llvm::dyn_cast<Variable>(From)) {
     // Check if the variable is guaranteed a physical register. This can happen
     // either when the variable is pre-colored or when it is assigned infinite
@@ -6046,7 +6169,8 @@
     }
     return From;
   }
-  llvm_unreachable("Unhandled operand kind in legalize()");
+
+  llvm::report_fatal_error("Unhandled operand kind in legalize()");
   return From;
 }
 
@@ -6116,7 +6240,7 @@
       // offset, we will work on the whole memory operand later as one entity
       // later, this save one instruction. By turning blinding and pooling off,
       // we guarantee legalize(Offset) will return a Constant*.
-      {
+      if (!llvm::isa<ConstantRelocatable>(Offset)) {
         BoolFlagSaver B(RandomizationPoolingPaused, true);
 
         Offset = llvm::cast<Constant>(legalize(Offset));
@@ -6125,6 +6249,11 @@
       assert(llvm::isa<ConstantInteger32>(Offset) ||
              llvm::isa<ConstantRelocatable>(Offset));
     }
+    // Not completely sure whether it's OK to leave IsPIC unset when creating
+    // the mem operand.  If DoLegalize is true, it will definitely be applied
+    // during the legalize() call, but perhaps not during the
+    // randomizeOrPoolImmediate() call.  In any case, the emit routines will
+    // assert that PIC legalization has been applied.
     Mem = X86OperandMem::create(Func, Ty, Base, Offset);
   }
   // Do legalization, which contains randomization/pooling or do
@@ -6192,7 +6321,7 @@
   if (!BuildDefs::dump())
     return;
   Ostream &Str = Ctx->getStrEmit();
-  Str << getConstantPrefix() << C->getValue();
+  Str << "$" << C->getValue();
 }
 
 template <typename TraitsType>
@@ -6203,7 +6332,7 @@
     if (!BuildDefs::dump())
       return;
     Ostream &Str = Ctx->getStrEmit();
-    Str << getConstantPrefix() << C->getValue();
+    Str << "$" << C->getValue();
   }
 }
 
@@ -6228,6 +6357,16 @@
   llvm::report_fatal_error("undef value encountered by emitter.");
 }
 
+template <class Machine>
+void TargetX86Base<Machine>::emit(const ConstantRelocatable *C) const {
+  if (!BuildDefs::dump())
+    return;
+  assert(!Ctx->getFlags().getUseNonsfi());
+  Ostream &Str = Ctx->getStrEmit();
+  Str << "$";
+  emitWithoutPrefix(C);
+}
+
 /// Randomize or pool an Immediate.
 template <typename TraitsType>
 Operand *
@@ -6292,8 +6431,12 @@
       constexpr bool SuppressMangling = true;
       Constant *Symbol =
           Ctx->getConstantSym(Offset, Label_stream.str(), SuppressMangling);
+      const bool UseNonsfi = Ctx->getFlags().getUseNonsfi();
+      Variable *Base = UseNonsfi ? legalizeToReg(GotVar) : nullptr;
       X86OperandMem *MemOperand =
-          X86OperandMem::create(Func, Immediate->getType(), nullptr, Symbol);
+          X86OperandMem::create(Func, Immediate->getType(), Base, Symbol);
+      if (UseNonsfi)
+        MemOperand->setIsPIC();
       _mov(Reg, MemOperand);
       return Reg;
     }
@@ -6385,8 +6528,12 @@
         constexpr bool SuppressMangling = true;
         Constant *Symbol = Ctx->getConstantSym(SymOffset, Label_stream.str(),
                                                SuppressMangling);
+        const bool UseNonsfi = Ctx->getFlags().getUseNonsfi();
+        Variable *Base = UseNonsfi ? legalizeToReg(GotVar) : nullptr;
         X86OperandMem *SymbolOperand = X86OperandMem::create(
-            Func, MemOperand->getOffset()->getType(), nullptr, Symbol);
+            Func, MemOperand->getOffset()->getType(), Base, Symbol);
+        if (UseNonsfi)
+          SymbolOperand->setIsPIC();
         _mov(RegTemp, SymbolOperand);
         // If we have a base variable here, we should add the lea instruction
         // to add the value of the base variable to RegTemp. If there is no
diff --git a/tests_lit/llvm2ice_tests/adv-switch-opt.ll b/tests_lit/llvm2ice_tests/adv-switch-opt.ll
index fd3038b..664e66b 100644
--- a/tests_lit/llvm2ice_tests/adv-switch-opt.ll
+++ b/tests_lit/llvm2ice_tests/adv-switch-opt.ll
@@ -32,9 +32,8 @@
 ; CHECK: sub [[IND:[^,]+]],0x5b
 ; CHECK-NEXT: cmp [[IND]],0x8
 ; CHECK-NEXT: ja
-; CHECK-NEXT: mov [[BASE:[^,]+]],0x0 {{[0-9a-f]+}}: R_386_32 .{{.*}}testJumpTable$jumptable
-; CHECK-NEXT: mov {{.*}},DWORD PTR {{\[}}[[BASE]]+[[IND]]*4]
-; CHECK-NEXT: jmp
+; CHECK-NEXT: mov [[TARGET:.*]],DWORD PTR {{\[}}[[IND]]*4+0x0] {{[0-9a-f]+}}: R_386_32 .{{.*}}testJumpTable$jumptable
+; CHECK-NEXT: jmp [[TARGET]]
 
 ; Continuous ranges which map to the same target should be grouped and
 ; efficiently tested.
diff --git a/tests_lit/llvm2ice_tests/nonsfi.ll b/tests_lit/llvm2ice_tests/nonsfi.ll
new file mode 100644
index 0000000..bf40f16
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/nonsfi.ll
@@ -0,0 +1,115 @@
+; RUN: %p2i -i %s --filetype=obj --assemble --disassemble --args -O2 -nonsfi=1 \
+; RUN:   | FileCheck --check-prefix=NONSFI %s
+; RUN: %p2i -i %s --filetype=obj --assemble --disassemble --args -O2 -nonsfi=0 \
+; RUN:   | FileCheck --check-prefix=DEFAULT %s
+
+@G1 = internal global [4 x i8] zeroinitializer, align 4
+@G2 = internal global [4 x i8] zeroinitializer, align 4
+
+define internal void @testCallRegular() {
+entry:
+  call void @testCallRegular()
+  ret void
+}
+; Expect a simple direct call to testCallRegular.
+; NONSFI-LABEL: testCallRegular
+; NONSFI: call {{.*}} R_386_PC32 testCallRegular
+; DEFAULT-LABEL: testCallRegular
+
+define internal double @testCallBuiltin(double %val) {
+entry:
+  %result = frem double %val, %val
+  ret double %result
+}
+; Expect a simple direct call to fmod.
+; NONSFI-LABEL: testCallBuiltin
+; NONSFI: call {{.*}} R_386_PC32 fmod
+; DEFAULT-LABEL: testCallBuiltin
+
+define internal i32 @testLoadBasic() {
+entry:
+  %a = bitcast [4 x i8]* @G1 to i32*
+  %b = load i32, i32* %a, align 1
+  ret i32 %b
+}
+; Expect a load with an R_386_GOTOFF relocation.
+; NONSFI-LABEL: testLoadBasic
+; NONSFI: mov {{.*}} R_386_GOTOFF G1
+; DEFAULT-LABEL: testLoadBasic
+
+define internal i32 @testLoadFixedOffset() {
+entry:
+  %a = ptrtoint [4 x i8]* @G1 to i32
+  %a1 = add i32 %a, 4
+  %a2 = inttoptr i32 %a1 to i32*
+  %b = load i32, i32* %a2, align 1
+  ret i32 %b
+}
+; Expect a load with an R_386_GOTOFF relocation plus an immediate offset.
+; NONSFI-LABEL: testLoadFixedOffset
+; NONSFI: mov {{.*}}+0x4] {{.*}} R_386_GOTOFF G1
+; DEFAULT-LABEL: testLoadFixedOffset
+
+define internal i32 @testLoadIndexed(i32 %idx) {
+entry:
+  %a = ptrtoint [4 x i8]* @G1 to i32
+  %a0 = mul i32 %idx, 4
+  %a1 = add i32 %a0, 12
+  %a2 = add i32 %a1, %a
+  %a3 = inttoptr i32 %a2 to i32*
+  %b = load i32, i32* %a3, align 1
+  ret i32 %b
+}
+; Expect a load with an R_386_GOTOFF relocation plus an immediate offset, plus a
+; scaled index register.
+; NONSFI-LABEL: testLoadIndexed
+; NONSFI: mov {{.*}}*4+0xc] {{.*}} R_386_GOTOFF G1
+; DEFAULT-LABEL: testLoadIndexed
+
+define internal i32 @testLoadIndexedBase(i32 %base, i32 %idx) {
+entry:
+  %a = ptrtoint [4 x i8]* @G1 to i32
+  %a0 = mul i32 %idx, 4
+  %a1 = add i32 %a0, %base
+  %a2 = add i32 %a1, %a
+  %a3 = add i32 %a2, 12
+  %a4 = inttoptr i32 %a3 to i32*
+  %b = load i32, i32* %a4, align 1
+  ret i32 %b
+}
+; Expect a load with an R_386_GOTOFF relocation plus an immediate offset, but
+; without the scaled index.
+; NONSFI-LABEL: testLoadIndexedBase
+; NONSFI: mov {{.*}}*1+0xc] {{.*}} R_386_GOTOFF G1
+; By contrast, without -nonsfi, expect a load with an *R_386_32* relocation plus
+; an immediate offset, and *with* the scaled index.
+; DEFAULT-LABEL: testLoadIndexedBase
+; DEFAULT: mov {{.*}},DWORD PTR [{{.*}}+{{.*}}*4+0xc] {{.*}} R_386_32 G1
+
+define internal i32 @testLoadOpt() {
+entry:
+  %a = bitcast [4 x i8]* @G1 to i32*
+  %b = load i32, i32* %a, align 1
+  %c = bitcast [4 x i8]* @G2 to i32*
+  %d = load i32, i32* %c, align 1
+  %e = add i32 %b, %d
+  ret i32 %e
+}
+; Expect a load-folding optimization with an R_386_GOTOFF relocation.
+; NONSFI-LABEL: testLoadOpt
+; NONSFI: mov [[REG:e..]],{{.*}}+0x0] {{.*}} R_386_GOTOFF G1
+; NONSFI-NEXT: add [[REG]],{{.*}}+0x0] {{.*}} R_386_GOTOFF G2
+; DEFAULT-LABEL: testLoadOpt
+
+define internal void @testRMW() {
+entry:
+  %a = bitcast [4 x i8]* @G1 to i32*
+  %b = load i32, i32* %a, align 1
+  %c = add i32 %b, 1234
+  store i32 %c, i32* %a, align 1
+  ret void
+}
+; Expect an RMW optimization with an R_386_GOTOFF relocation.
+; NONSFI-LABEL: testRMW
+; NONSFI: add DWORD PTR {{.*}}+0x0],0x4d2 {{.*}} R_386_GOTOFF G1
+; DEFAULT-LABEL: testRMW