| #!/usr/bin/python |
| # |
| #Bertrone Matteo - Polytechnic of Turin |
| #November 2015 |
| # |
| #eBPF application that parses HTTP packets |
| #and extracts (and prints on screen) the URL contained in the GET/POST request. |
| # |
| #eBPF program http_filter is used as SOCKET_FILTER attached to eth0 interface. |
| #only packet of type ip and tcp containing HTTP GET/POST are returned to userspace, others dropped |
| # |
| #python script uses bcc BPF Compiler Collection by iovisor (https://github.com/iovisor/bcc) |
| #and prints on stdout the first line of the HTTP GET/POST request containing the url |
| |
| from __future__ import print_function |
| from bcc import BPF |
| from ctypes import * |
| from struct import * |
| |
| import sys |
| import socket |
| import os |
| import struct |
| import binascii |
| import time |
| |
| CLEANUP_N_PACKETS = 50 #run cleanup every CLEANUP_N_PACKETS packets received |
| MAX_URL_STRING_LEN = 8192 #max url string len (usually 8K) |
| MAX_AGE_SECONDS = 30 #max age entry in bpf_sessions map |
| #-----FUNCTIONS-BEGIN----------------------# |
| |
| #convert a bin string into a string of hex char |
| #helper function to print raw packet in hex |
| def toHex(s): |
| lst = [] |
| for ch in s: |
| hv = hex(ord(ch)).replace('0x', '') |
| if len(hv) == 1: |
| hv = '0'+hv |
| lst.append(hv) |
| |
| return reduce(lambda x,y:x+y, lst) |
| |
| #print str until CR+LF |
| def printUntilCRLF(str): |
| for k in range (0,len(str)-1): |
| if (str[k] == '\n'): |
| if (str[k-1] == '\r'): |
| print ("") |
| return |
| print ("%c" % (str[k]), end = "") |
| print("") |
| return |
| |
| #cleanup function |
| def cleanup(): |
| #get current time in seconds |
| current_time = int(time.time()) |
| #looking for leaf having: |
| #timestap == 0 --> update with current timestamp |
| #AGE > MAX_AGE_SECONDS --> delete item |
| for key,leaf in bpf_sessions.items(): |
| try: |
| current_leaf = bpf_sessions[key] |
| #set timestamp if timestamp == 0 |
| if (current_leaf.timestamp == 0): |
| bpf_sessions[key] = bpf_sessions.Leaf(current_time) |
| else: |
| #delete older entries |
| if (current_time - current_leaf.timestamp > MAX_AGE_SECONDS): |
| del bpf_sessions[key] |
| except: |
| print("cleanup exception.") |
| return |
| |
| #-----FUNCTIONS-END-------------------------# |
| |
| |
| # initialize BPF - load source code from http-parse-complete.c |
| bpf = BPF(src_file = "http-parse-complete.c",debug = 0) |
| |
| #load eBPF program http_filter of type SOCKET_FILTER into the kernel eBPF vm |
| #more info about eBPF program types |
| #http://man7.org/linux/man-pages/man2/bpf.2.html |
| function_http_filter = bpf.load_func("http_filter", BPF.SOCKET_FILTER) |
| |
| #create raw socket, bind it to eth0 |
| #attach bpf program to socket created |
| BPF.attach_raw_socket(function_http_filter, "eth0") |
| |
| #get file descriptor of the socket previously created inside BPF.attach_raw_socket |
| socket_fd = function_http_filter.sock |
| |
| #create python socket object, from the file descriptor |
| sock = socket.fromfd(socket_fd,socket.PF_PACKET,socket.SOCK_RAW,socket.IPPROTO_IP) |
| #set it as blocking socket |
| sock.setblocking(True) |
| |
| #get pointer to bpf map of type hash |
| bpf_sessions = bpf.get_table("sessions") |
| |
| #packets counter |
| packet_count = 0 |
| |
| #dictionary containing association <key(ipsrc,ipdst,portsrc,portdst),payload_string> |
| #if url is not entirely contained in only one packet, save the firt part of it in this local dict |
| #when I find \r\n in a next pkt, append and print all the url |
| local_dictionary = {} |
| |
| while 1: |
| #retrieve raw packet from socket |
| packet_str = os.read(socket_fd,4096) #set packet length to max packet length on the interface |
| packet_count += 1 |
| |
| #DEBUG - print raw packet in hex format |
| #packet_hex = toHex(packet_str) |
| #print ("%s" % packet_hex) |
| |
| #convert packet into bytearray |
| packet_bytearray = bytearray(packet_str) |
| |
| #ethernet header length |
| ETH_HLEN = 14 |
| |
| #IP HEADER |
| #https://tools.ietf.org/html/rfc791 |
| # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 |
| # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
| # |Version| IHL |Type of Service| Total Length | |
| # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
| # |
| #IHL : Internet Header Length is the length of the internet header |
| #value to multiply * 4 byte |
| #e.g. IHL = 5 ; IP Header Length = 5 * 4 byte = 20 byte |
| # |
| #Total length: This 16-bit field defines the entire packet size, |
| #including header and data, in bytes. |
| |
| #calculate packet total length |
| total_length = packet_bytearray[ETH_HLEN + 2] #load MSB |
| total_length = total_length << 8 #shift MSB |
| total_length = total_length + packet_bytearray[ETH_HLEN+3] #add LSB |
| |
| #calculate ip header length |
| ip_header_length = packet_bytearray[ETH_HLEN] #load Byte |
| ip_header_length = ip_header_length & 0x0F #mask bits 0..3 |
| ip_header_length = ip_header_length << 2 #shift to obtain length |
| |
| #retrieve ip source/dest |
| ip_src_str = packet_str[ETH_HLEN+12:ETH_HLEN+16] #ip source offset 12..15 |
| ip_dst_str = packet_str[ETH_HLEN+16:ETH_HLEN+20] #ip dest offset 16..19 |
| |
| ip_src = int(toHex(ip_src_str),16) |
| ip_dst = int(toHex(ip_dst_str),16) |
| |
| #TCP HEADER |
| #https://www.rfc-editor.org/rfc/rfc793.txt |
| # 12 13 14 15 |
| # 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 |
| # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
| # | Data | |U|A|P|R|S|F| | |
| # | Offset| Reserved |R|C|S|S|Y|I| Window | |
| # | | |G|K|H|T|N|N| | |
| # +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
| # |
| #Data Offset: This indicates where the data begins. |
| #The TCP header is an integral number of 32 bits long. |
| #value to multiply * 4 byte |
| #e.g. DataOffset = 5 ; TCP Header Length = 5 * 4 byte = 20 byte |
| |
| #calculate tcp header length |
| tcp_header_length = packet_bytearray[ETH_HLEN + ip_header_length + 12] #load Byte |
| tcp_header_length = tcp_header_length & 0xF0 #mask bit 4..7 |
| tcp_header_length = tcp_header_length >> 2 #SHR 4 ; SHL 2 -> SHR 2 |
| |
| #retrieve port source/dest |
| port_src_str = packet_str[ETH_HLEN+ip_header_length:ETH_HLEN+ip_header_length+2] |
| port_dst_str = packet_str[ETH_HLEN+ip_header_length+2:ETH_HLEN+ip_header_length+4] |
| |
| port_src = int(toHex(port_src_str),16) |
| port_dst = int(toHex(port_dst_str),16) |
| |
| #calculate payload offset |
| payload_offset = ETH_HLEN + ip_header_length + tcp_header_length |
| |
| #payload_string contains only packet payload |
| payload_string = packet_str[(payload_offset):(len(packet_bytearray))] |
| |
| #CR + LF (substring to find) |
| crlf = "\r\n" |
| |
| #current_Key contains ip source/dest and port source/map |
| #useful for direct bpf_sessions map access |
| current_Key = bpf_sessions.Key(ip_src,ip_dst,port_src,port_dst) |
| |
| #looking for HTTP GET/POST request |
| if ((payload_string[:3] == "GET") or (payload_string[:4] == "POST") or (payload_string[:4] == "HTTP") \ |
| or ( payload_string[:3] == "PUT") or (payload_string[:6] == "DELETE") or (payload_string[:4] == "HEAD") ): |
| #match: HTTP GET/POST packet found |
| if (crlf in payload_string): |
| #url entirely contained in first packet -> print it all |
| printUntilCRLF(payload_string) |
| |
| #delete current_Key from bpf_sessions, url already printed. current session not useful anymore |
| try: |
| del bpf_sessions[current_Key] |
| except: |
| print ("error during delete from bpf map ") |
| else: |
| #url NOT entirely contained in first packet |
| #not found \r\n in payload. |
| #save current part of the payload_string in dictionary <key(ips,ipd,ports,portd),payload_string> |
| local_dictionary[binascii.hexlify(current_Key)] = payload_string |
| else: |
| #NO match: HTTP GET/POST NOT found |
| |
| #check if the packet belong to a session saved in bpf_sessions |
| if (current_Key in bpf_sessions): |
| #check id the packet belong to a session saved in local_dictionary |
| #(local_dictionary mantains HTTP GET/POST url not printed yet because splitted in N packets) |
| if (binascii.hexlify(current_Key) in local_dictionary): |
| #first part of the HTTP GET/POST url is already present in local dictionary (prev_payload_string) |
| prev_payload_string = local_dictionary[binascii.hexlify(current_Key)] |
| #looking for CR+LF in current packet. |
| if (crlf in payload_string): |
| #last packet. containing last part of HTTP GET/POST url splitted in N packets. |
| #append current payload |
| prev_payload_string += payload_string |
| #print HTTP GET/POST url |
| printUntilCRLF(prev_payload_string) |
| #clean bpf_sessions & local_dictionary |
| try: |
| del bpf_sessions[current_Key] |
| del local_dictionary[binascii.hexlify(current_Key)] |
| except: |
| print ("error deleting from map or dictionary") |
| else: |
| #NOT last packet. containing part of HTTP GET/POST url splitted in N packets. |
| #append current payload |
| prev_payload_string += payload_string |
| #check if not size exceeding (usually HTTP GET/POST url < 8K ) |
| if (len(prev_payload_string) > MAX_URL_STRING_LEN): |
| print("url too long") |
| try: |
| del bpf_sessions[current_Key] |
| del local_dictionary[binascii.hexlify(current_Key)] |
| except: |
| print ("error deleting from map or dict") |
| #update dictionary |
| local_dictionary[binascii.hexlify(current_Key)] = prev_payload_string |
| else: |
| #first part of the HTTP GET/POST url is NOT present in local dictionary |
| #bpf_sessions contains invalid entry -> delete it |
| try: |
| del bpf_sessions[current_Key] |
| except: |
| print ("error del bpf_session") |
| |
| #check if dirty entry are present in bpf_sessions |
| if (((packet_count) % CLEANUP_N_PACKETS) == 0): |
| cleanup() |