Local Analysis & Optimization

reminder hw1 due on Friday- I expected some questions?

and anyone who forgot hw0, still needed!

llvm control flow graph

commands to draw a dot list of a c file from llvm

 clang -S -emit-llvm images/identity.c -o foo.ll 
 opt -dot-cfg foo.ll -disable-output -enable-new-pm=0
void identity(int **a, int N)
{
    int i, j;
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            a[i][j] = 0;
        }
    }
    for (i = 0; i < N; i++)
    {
        a[i][i] = 1;
    }
}
digraph "CFG for 'identity' function" {
    label="CFG for 'identity' function";

    Node0x12c5490 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#88abfd70",label="{%2:\l  %3 = alloca i32**, align 8\l  %4 = alloca i32, align 4\l  %5 = alloca i32, align 4\l  %6 = alloca i32, align 4\l  store i32** %0, i32*** %3, align 8\l  store i32 %1, i32* %4, align 4\l  store i32 0, i32* %5, align 4\l  br label %7\l}"];
    Node0x12c5490 -> Node0x12c5da0;
    Node0x12c5da0 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#f3c7b170",label="{%7:\l7:                                                \l  %8 = load i32, i32* %5, align 4\l  %9 = load i32, i32* %4, align 4\l  %10 = icmp slt i32 %8, %9\l  br i1 %10, label %11, label %32\l|{<s0>T|<s1>F}}"];
    Node0x12c5da0:s0 -> Node0x12c5c70;
    Node0x12c5da0:s1 -> Node0x12c5f40;
    Node0x12c5c70 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#f3c7b170",label="{%11:\l11:                                               \l  store i32 0, i32* %6, align 4\l  br label %12\l}"];
    Node0x12c5c70 -> Node0x12c6080;
    Node0x12c6080 [shape=record,color="#b70d28ff", style=filled, fillcolor="#b70d2870",label="{%12:\l12:                                               \l  %13 = load i32, i32* %6, align 4\l  %14 = load i32, i32* %4, align 4\l  %15 = icmp slt i32 %13, %14\l  br i1 %15, label %16, label %28\l|{<s0>T|<s1>F}}"];
    Node0x12c6080:s0 -> Node0x12c62b0;
    Node0x12c6080:s1 -> Node0x12c6300;
    Node0x12c62b0 [shape=record,color="#b70d28ff", style=filled, fillcolor="#b70d2870",label="{%16:\l16:                                               \l  %17 = load i32**, i32*** %3, align 8\l  %18 = load i32, i32* %5, align 4\l  %19 = sext i32 %18 to i64\l  %20 = getelementptr inbounds i32*, i32** %17, i64 %19\l  %21 = load i32*, i32** %20, align 8\l  %22 = load i32, i32* %6, align 4\l  %23 = sext i32 %22 to i64\l  %24 = getelementptr inbounds i32, i32* %21, i64 %23\l  store i32 0, i32* %24, align 4\l  br label %25\l}"];
    Node0x12c62b0 -> Node0x12c6820;
    Node0x12c6820 [shape=record,color="#b70d28ff", style=filled, fillcolor="#b70d2870",label="{%25:\l25:                                               \l  %26 = load i32, i32* %6, align 4\l  %27 = add nsw i32 %26, 1\l  store i32 %27, i32* %6, align 4\l  br label %12, !llvm.loop !6\l}"];
    Node0x12c6820 -> Node0x12c6080;
    Node0x12c6300 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#f3c7b170",label="{%28:\l28:                                               \l  br label %29\l}"];
    Node0x12c6300 -> Node0x12c75b0;
    Node0x12c75b0 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#f3c7b170",label="{%29:\l29:                                               \l  %30 = load i32, i32* %5, align 4\l  %31 = add nsw i32 %30, 1\l  store i32 %31, i32* %5, align 4\l  br label %7, !llvm.loop !8\l}"];
    Node0x12c75b0 -> Node0x12c5da0;
    Node0x12c5f40 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#88abfd70",label="{%32:\l32:                                               \l  store i32 0, i32* %5, align 4\l  br label %33\l}"];
    Node0x12c5f40 -> Node0x12c7bd0;
    Node0x12c7bd0 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#f3c7b170",label="{%33:\l33:                                               \l  %34 = load i32, i32* %5, align 4\l  %35 = load i32, i32* %4, align 4\l  %36 = icmp slt i32 %34, %35\l  br i1 %36, label %37, label %49\l|{<s0>T|<s1>F}}"];
    Node0x12c7bd0:s0 -> Node0x12c7e00;
    Node0x12c7bd0:s1 -> Node0x12c7e50;
    Node0x12c7e00 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#f3c7b170",label="{%37:\l37:                                               \l  %38 = load i32**, i32*** %3, align 8\l  %39 = load i32, i32* %5, align 4\l  %40 = sext i32 %39 to i64\l  %41 = getelementptr inbounds i32*, i32** %38, i64 %40\l  %42 = load i32*, i32** %41, align 8\l  %43 = load i32, i32* %5, align 4\l  %44 = sext i32 %43 to i64\l  %45 = getelementptr inbounds i32, i32* %42, i64 %44\l  store i32 1, i32* %45, align 4\l  br label %46\l}"];
    Node0x12c7e00 -> Node0x12c8400;
    Node0x12c8400 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#f3c7b170",label="{%46:\l46:                                               \l  %47 = load i32, i32* %5, align 4\l  %48 = add nsw i32 %47, 1\l  store i32 %48, i32* %5, align 4\l  br label %33, !llvm.loop !9\l}"];
    Node0x12c8400 -> Node0x12c7bd0;
    Node0x12c7e50 [shape=record,color="#3d50c3ff", style=filled, fillcolor="#88abfd70",label="{%49:\l49:                                               \l  ret void\l}"];
}

CFG for 'identity' function CFG for 'identity' function Node0x12c5490 %2: %3 = alloca i32**, align 8 %4 = alloca i32, align 4 %5 = alloca i32, align 4 %6 = alloca i32, align 4 store i32** %0, i32*** %3, align 8 store i32 %1, i32* %4, align 4 store i32 0, i32* %5, align 4 br label %7 Node0x12c5da0 %7: 7: %8 = load i32, i32* %5, align 4 %9 = load i32, i32* %4, align 4 %10 = icmp slt i32 %8, %9 br i1 %10, label %11, label %32 T F Node0x12c5490->Node0x12c5da0 Node0x12c5c70 %11: 11: store i32 0, i32* %6, align 4 br label %12 Node0x12c5da0:s0->Node0x12c5c70 Node0x12c5f40 %32: 32: store i32 0, i32* %5, align 4 br label %33 Node0x12c5da0:s1->Node0x12c5f40 Node0x12c6080 %12: 12: %13 = load i32, i32* %6, align 4 %14 = load i32, i32* %4, align 4 %15 = icmp slt i32 %13, %14 br i1 %15, label %16, label %28 T F Node0x12c5c70->Node0x12c6080 Node0x12c7bd0 %33: 33: %34 = load i32, i32* %5, align 4 %35 = load i32, i32* %4, align 4 %36 = icmp slt i32 %34, %35 br i1 %36, label %37, label %49 T F Node0x12c5f40->Node0x12c7bd0 Node0x12c62b0 %16: 16: %17 = load i32**, i32*** %3, align 8 %18 = load i32, i32* %5, align 4 %19 = sext i32 %18 to i64 %20 = getelementptr inbounds i32*, i32** %17, i64 %19 %21 = load i32*, i32** %20, align 8 %22 = load i32, i32* %6, align 4 %23 = sext i32 %22 to i64 %24 = getelementptr inbounds i32, i32* %21, i64 %23 store i32 0, i32* %24, align 4 br label %25 Node0x12c6080:s0->Node0x12c62b0 Node0x12c6300 %28: 28: br label %29 Node0x12c6080:s1->Node0x12c6300 Node0x12c6820 %25: 25: %26 = load i32, i32* %6, align 4 %27 = add nsw i32 %26, 1 store i32 %27, i32* %6, align 4 br label %12, !llvm.loop !6 Node0x12c62b0->Node0x12c6820 Node0x12c75b0 %29: 29: %30 = load i32, i32* %5, align 4 %31 = add nsw i32 %30, 1 store i32 %31, i32* %5, align 4 br label %7, !llvm.loop !8 Node0x12c6300->Node0x12c75b0 Node0x12c6820->Node0x12c6080 Node0x12c75b0->Node0x12c5da0 Node0x12c7e00 %37: 37: %38 = load i32**, i32*** %3, align 8 %39 = load i32, i32* %5, align 4 %40 = sext i32 %39 to i64 %41 = getelementptr inbounds i32*, i32** %38, i64 %40 %42 = load i32*, i32** %41, align 8 %43 = load i32, i32* %5, align 4 %44 = sext i32 %43 to i64 %45 = getelementptr inbounds i32, i32* %42, i64 %44 store i32 1, i32* %45, align 4 br label %46 Node0x12c7bd0:s0->Node0x12c7e00 Node0x12c7e50 %49: 49: ret void Node0x12c7bd0:s1->Node0x12c7e50 Node0x12c8400 %46: 46: %47 = load i32, i32* %5, align 4 %48 = add nsw i32 %47, 1 store i32 %48, i32* %5, align 4 br label %33, !llvm.loop !9 Node0x12c7e00->Node0x12c8400 Node0x12c8400->Node0x12c7bd0

flavors of optimization

I want to separate 3 flavors of optimization.

  1. local meaning within one basic block
  2. global meaning within one function (not really global)
  3. inter-procedural over the entire program

Usually an optimization takes time that is more then linear in some property, For example a local optimization might take time \(n^2\) in the number of instructions in the block. a global optimization might take much longer, and an inter-procedural longer still. To keep compile time reasonable many compilers limit the number of global optimizations and skip inter-procedural optimizations. As a consequence many more optimizations get published but not used in production.


When would running an optimization speedup compilation?

For a local optimization, instructions within a block are ordered, so it makes sense to talk about instructions coming before or after others.

For a global optimization, two instructions are ordered by a path from one block to another and different paths through the program give different orders.


One special case is JIT (just in time) compilers, where programs get compiled at the start of execution. GPU compilers (and java compilers) look like this. They may use run-time information to decide of recompiling a function is a good idea. This is called Hotspot compiling. Some JIT compilers use hot/cold compiling, where they only run the fancy compiler on basic blocks that are hot , i.e., execute a lot.

%%{init: {"flowchart": {"htmlLabels": false}} }%%
flowchart LR
A[application] -- offline --> B[byte code/ptx]
B --> C[quick run time compiler/ finalizer]
C --> D[isa]
B --> C1[fancy compiler - only run on long running functions];
C1 --> D;

%%{init: {"flowchart": {"htmlLabels": false}} }%%
flowchart LR
A[application] -- offline --> B[byte code/ptx]
B --> C[quick run time compiler/ finalizer]
C --> D[isa]
B --> C1[fancy compiler - only run on long running functions];
C1 --> D;


We are going to consider several versions of trivial dead code elimination. Trivial because we are going to hold off on control flow related optimizations till later. Sometimes people call this DCE or trivial DCE.


For each case, we start by defining what we mean by dead code.

example 1

@main {
  a: int = const 4;
  b: int = const 2;
  c: int = const 1;
  d: int = add a b;
  print d;
}

What instruction is dead? (meaning get the same answer if we delete the instruction) What is your definition? Is this meaning of dead code local or global?


Why would you ever have dead code in a program? One reason is that have DCE as a separate pass means other optimizations do not have to clean up.

Definition 1- Dead if instruction writes a variable and the variable is never used.

An instruction that has side-effects, like a print statement does not write a variable so it never gets deleted. Labels do not write a variable so they do not get deleted as well.


What is the pseudo code to find dead instructions using this definition?

. . .

used = empty set 
for instr in func 
   used += instr.args 
for instd in func
    if instr has a dest and dest in not in used 
       delete instr

example 2

@main {
  a: int = const 4;
  b: int = const 2;
  c: int = const 1;  
  d: int = add a b;
  e: int = add c d; 
  print d;
}

. . .

The code so far only deletes one instruction, but we would like to get rid of two. Instruction c should also be dead. How do we change the definition

Definition 2- Dead if instruction writes a variable and the variable is either never used or only used in dead instructions.

iterating till convergence

while changes:
       run one pass of tdce above

what would be faster? What is some pseudo code for the change

. . .

  find all the variables that are used in more then one block
  for each block b 
     used = all variables used in more then one block
     walk backwards over the instruction in the block
     for each instruction is dest in used?
        yes - remove dest from used, add arguments to used 
        no  - instruction is dead 

finding all the variables used in more then one block might be expensive


example 3

@main {
  a: int = const 4;
  a: int = const 200;
  print a;
}

Definition? An instruction is dead if that instruction writes a variable v and no path starting at that instruction reaches a use of v

this talks about paths (control flow paths)

@main {
  a: int = const 4;
     br input .then .else 
  .then
  a: int = const 200;
  .else 
  print a;
}

for now we want to skip control flow

Definition: An instruction is dead if that instruction writes a variable v and no path within the block starting at that instruction reaches a use of v in the same block or reaches the exit of the block


cands are the variables that are defined but not used 
last_def = {}  variables -> instructions 
this is a mapping variables that have been defined but not used

   for instr in block:
      each arg (use) removes arg from last def 
      if the instr has a dest 
          if the dest is in last_def, 
      add dest->instr to last def
  

and as you might expect, we need to iterate this till convergence


Compilers often run dce more then once- why?


testing out dce

  1. program should get the same answer

  2. program should run less instructions


Some test cases:

  1. simple.bril,

  2. reassign.bril,

  3. other examples in the DCE test directory

testing

bril2json < bench.bril | python3 tdce.py | bril2txt

Next, try using wc to check static code size differences:

bril2json < bench.bril | wc -l

bril2json < bench.bril | python3 tdce.py | wc -l

Then profiling to measure dynamic instruction count: The bril interpreter has a flag -p which prints the number of dynamically executed instructions.

How good a measure is this for real programs?

test with profile

bril2json < bench.bril | brili -p

bril2json < bench.bril | python3 tdce.py | brili -p

using trunt (golden images)

  1. Configure. Decide what command you want to test. Make a turnt.toml config file and put command = “mycmd {filename}” in it to pass each test file as an argument to mycmd.

  2. Take a snapshot. Run turnt –save foo.bril. Execute mycmd foo.bril and save the standard output into foo.out.

You might want to take a look at this output to make sure it’s what you expect

  1. Test your work. Now that you have a test in place, keep working. Use turnt *.bril to run all your tests and confirm that the output still matches.

If there’s a mismatch, you can do turnt –diff to see the changes.

peephole optimizations

1.Peephole optimizations are a category of local code optimizations.
1. The principle is very simple: a. the optimizer analyzes sequences of instructions. –  a. only code that is within a small window of instructions is analyzed each time.
a. this window slides over the code. a. once patterns are discovered inside this window, optimizations are applied.

some examples

redundant loads and stores

m = load r0 store m in r0

branch transformations

’’’ if debug ==1 go to l1 go to l2 l1: l2:

transforms to 

if debug !=1 goto l2 l1: l2: ```

reduction in strength

4*x => x << 2

special machine idioms

Back to top