Linux Internals: 2014

Tuesday, October 28, 2014

mongodb storage engine

Download mongodb and rocksdb

$ git clone https://github.com/mongodb/mongo.git   
$ git clone https://github.com/facebook/rocksdb.git

Build Rocksdb

$ cd rocksdb
$ make static_lib
$ cp librocksdb.a /usr/local/lib
$ cp -r include/* /usr/local/include

Build mongodb

$ cd ..
$ cd mongo

For MAC

$ scons --rocksdb --libc++ --osx-version-min=10.7 mongo mongod

For Linux

$ scons --rocksdb mongo mongod

Kickoff parallel build with -j <n>

Experiment with mongod/rocksdb in local build directory without installing

$ mkdir ./data
$ ./mongod  --dbpath ./data --storageEngine=rocksExperiment

Ready to interact with mongodb with rocksdb storage engine

 
$ ./mongo

Sample Program to Interface with RocksDB

========================================

#include <iostream>
#include "rocksdb/db.h"

using namespace std;

int
main()
{

    std::string     value;
    std::string     key1, key2;
    rocksdb::DB* db;
    rocksdb::Options options;
    options.create_if_missing = true;
    rocksdb::Status status = rocksdb::DB::Open(options, "/tmp/testdb", &db);

    cout << status.ToString() + "\n";
    key1 = string("foo");
    value = string("data");

    rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key1, &value);
    if (s.ok()) {
        s = db->Put(rocksdb::WriteOptions(), key1, value);
        cout << "Key Found " + key1 + " Value " + value + "\n";
    } else {
        cout << "Key Not Found Insert Key\n";
        s = db->Put(rocksdb::WriteOptions(), key1, value);
        if (s.ok()) {
            s = db->Get(rocksdb::ReadOptions(), key1, &value);
            if (s.ok()) {
                cout << "Key Found " + key1 + " Value " + value + "\n";
            }
        }
    }
    //db->Delete(rocksdb::WriteOptions(), key1);
    /* Close DB */
    delete db;
}

Compile Sample Program

======================

g++ -std=c++0x rocks_db.cpp -lrocksdb -lpthread  -lz

Friday, September 5, 2014

CPUID is a complex instruction that can be used to extract processor properties. It is also a serializing instruction, i.e. when it executes, all concurrent, speculative and pipelined execution is drained before the next instruction proceeds.

Information such as Processor, Cache/TLB, Cache Parameters, Performance Monitoring, L2 Cache information can be retrieved from user-space.

Refer to the Intel 64 and IA-32 Architectures Software Developer's Manual (CPUID instruction reference).

/*
 * Snapshot of the four general-purpose registers written by one CPUID
 * invocation, stored in the order EAX, EBX, ECX, EDX.
 */
typedef struct cpuid {
        uint32_t        eax, ebx, ecx, edx;
} cpuid_t;

static cpuid_t
get_cpu_id(uint32_t id)
{
        cpuid_t cpu;
        asm("mov %%ebx, %%esi\n\t"
                "cpuid\n\t"
                "xchg %%esi, %%ebx"
                : "=a" (cpu.eax),
                  "=S" (cpu.ebx),
                  "=c" (cpu.ecx),
                  "=d" (cpu.edx)
                : "a" (id)
        );
        return (cpu);
}

/*
 * Print a few facts obtained through CPUID: the highest basic leaf and the
 * vendor string (leaf 0), processor type and family (leaf 1), and the
 * physical/linear address widths (leaf 0x80000008, when implemented).
 */
int
main()
{
        cpuid_t cpu;
        char    name[100];
        unsigned int phys_bits, virt_bits;

        /* issue CPUID 0 instruction to read CPU information */
        cpu = get_cpu_id(0);
        printf("cpu.eax is %u\n", (unsigned int)cpu.eax);
        /* The 12-byte vendor string is laid out in EBX, EDX, ECX order. */
        snprintf(name, sizeof(name), "%.4s%.4s%.4s\n",
            (char *)&cpu.ebx, (char *)&cpu.edx, (char *)&cpu.ecx);
        printf("processor name is %s\n", name);

        /* Leaf 1 EAX: bits 13:12 = processor type, bits 11:8 = family. */
        cpu = get_cpu_id(1);
        printf("Processor Type is %x\n",
            (unsigned int)((cpu.eax & 0x00003000) >> 12));
        printf("Family Type is %x\n",
            (unsigned int)((cpu.eax & 0x00000F00) >> 8));

        /* Leaf 0x80000000 EAX reports the highest extended leaf available. */
        cpu = get_cpu_id(0x80000000);
        if (cpu.eax >= 0x80000008) {
                /* EAX bits 7:0 = physical address bits, 15:8 = linear bits. */
                cpu = get_cpu_id(0x80000008);
                phys_bits = cpu.eax & 0xFF;
                virt_bits = (cpu.eax >> 8) & 0xFF;
                /* Shift in 64-bit arithmetic; widths of 64+ cannot be shown. */
                printf("cache info phyMemory size %llx\n",
                    phys_bits < 64 ? 1ULL << phys_bits : 0ULL);
                printf("cache info virtMemory %llx\n",
                    virt_bits < 64 ? 1ULL << virt_bits : 0ULL);
        }
        return (0);
}

Monday, August 25, 2014

DPDK playground

Compile DPDK library
===============
make config T=x86_64-native-linuxapp-gcc
or
make config T=i686-native-linuxapp-gcc

make

/* With No Huge Page*/
build/app/testpmd -c f -n 1 -w 02:00.0 --no-huge

or

/* With Huge Page */
build/app/testpmd -c f -n 1 -w 02:00.0

To view the current setting using the /proc entry for huge page
# cat /proc/sys/vm/nr_hugepages
0

To set the number of huge pages using /proc entry:
# echo 5 > /proc/sys/vm/nr_hugepages

Mount hugetlbfs

#mount -t hugetlbfs nodev /mnt/huge/

Build Applications
============

make install T=x86_64-native-linuxapp-gcc

Sunday, June 29, 2014

LinkedList in Kernel Example

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/slab.h>

/* One element of the demo list: an integer payload plus its list linkage. */
struct object_list {
        int     id;             /* payload value */
        struct list_head list;  /* links this object into a list_head chain */
};

/* Head of the list that my_init() adds objects to and my_fini() drains. */
LIST_HEAD(obj_context);

/*
 * Module entry point: allocate one object_list element with id 5 and add it
 * to obj_context.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int __init
my_init(void)
{
        struct object_list *obj;

        obj = kmalloc(sizeof(*obj), GFP_KERNEL);
        if (!obj)
                return (-ENOMEM);       /* kmalloc may fail; never deref NULL */
        obj->id = 5;

        list_add(&obj->list, &obj_context);

        return (0);
}

/*
 * Module exit point: unlink every object still on obj_context and free it.
 * The _safe iterator keeps a pointer to the following entry so the current
 * one can be released inside the loop body.
 */
static void __exit
my_fini(void)
{
        struct object_list *cur, *following;

        list_for_each_entry_safe(cur, following, &obj_context, list) {
                list_del(&cur->list);
                kfree(cur);
        }
}

/* Register my_init/my_fini as the module's load and unload handlers. */
module_init(my_init);
module_exit(my_fini);

/* Module metadata. */
MODULE_DESCRIPTION("Sample Code");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Krishna Mohan");

I was working on porting a driver to user space and didn't want to write my own list implementation. I copied the code from the kernel and tried it in user space and, as expected, it worked fine. I'm doing the same for other functions, which I will keep posting on my blog.
#include <stdio.h>
#include <stdlib.h>

/*
 * Byte offset of MEMBER within TYPE. Skipped when a standard header has
 * already supplied offsetof; otherwise uses the compiler builtin rather than
 * arithmetic on a null pointer.
 */
#ifndef offsetof
#define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER)
#endif

/* Linkage node of a circular doubly linked list. */
struct list_head {
        struct list_head *next;         /* following node */
        struct list_head *prev;         /* preceding node */
};

/* Demo list element: list linkage followed by an integer payload. */
struct nvme_ns {
        struct list_head list;  /* links this element into a list */
        int num;                /* payload printed by main() */
};

/* Make `list` an empty list: both of its links point back at itself. */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
        list->next = list->prev = list;
}

/*
 * Insert _new between two nodes, prev and next, by rewriting the four
 * affected links. The caller supplies both neighbours.
 */
static inline void __list_add(struct list_head *_new,
                              struct list_head *prev,
                              struct list_head *next)
{
        next->prev = _new;
        _new->next = next;
        _new->prev = prev;
        prev->next = _new;
}

/*
 * Append _new at the end of the list, i.e. between the current last node
 * (head->prev) and head itself.
 */
static inline void list_add_tail(struct list_head *_new, struct list_head *head)
{
        struct list_head *last = head->prev;

        __list_add(_new, last, head);
}

/*
 * Given ptr, a pointer to the field `member` inside an object of `type`,
 * yield a pointer to the enclosing object by subtracting the member's
 * offset. The typed temporary __mptr makes the compiler check that ptr
 * matches the member's type. Relies on GNU statement expressions and typeof.
 */
#define container_of(ptr, type, member) ({                      \
        const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
        (type *)( (char *)__mptr - offsetof(type, member) ); })

/* Alias for container_of: map a list linkage pointer to its enclosing entry. */
#define list_entry(ptr, type, member)   \
        container_of(ptr, type, member)

/*
 * Loop header that walks every entry of the list at `head`, with `pos` as
 * the cursor (a pointer to the entry type) and `member` naming the linkage
 * field. The next pointer is read from pos after the body runs, so the body
 * must not free or unlink pos.
 */
#define list_for_each_entry(pos, head, member)                          \
        for (pos = list_entry((head)->next, typeof(*pos), member);      \
                &pos->member != (head);                                 \
                pos = list_entry(pos->member.next, typeof(*pos), member))

/*
 * Like list_for_each_entry, but the following entry is stored in `n` before
 * the body executes, so the body may unlink and free `pos`.
 */
#define list_for_each_entry_safe(pos, n, head, member)                  \
        for (pos = list_entry((head)->next, typeof(*pos), member),      \
                n = list_entry(pos->member.next, typeof(*pos), member); \
                &pos->member != (head);                                 \
                pos = n, n = list_entry(n->member.next, typeof(*n), member))

/*
 * Link prev and next directly to each other, dropping whatever sat between
 * them from the chain.
 */
static inline void __list_del(struct list_head *prev, struct list_head *next)
{
        next->prev = prev;
        prev->next = next;
}

/* Distinct non-NULL marker values that list_del() stores in a removed entry. */
#define LIST_POISON1    ((void *) 0x00100100)
#define LIST_POISON2    ((void *) 0x00200200)

/*
 * Remove `entry` from its list and overwrite its links with the poison
 * markers so that a later traversal through the stale entry is recognisable.
 */
static inline void list_del(struct list_head *entry)
{
        struct list_head *before = entry->prev;
        struct list_head *after = entry->next;

        __list_del(before, after);
        entry->next = (struct list_head *) LIST_POISON1;
        entry->prev = (struct list_head *) LIST_POISON2;
}

/*
 * Demo: build a two-element list of nvme_ns (num = 1, 2), print it, then
 * unlink and free every element.
 *
 * Returns 0 on success, 1 when an allocation fails (anything already linked
 * is still freed).
 */
int
main()
{
        struct list_head name;
        struct nvme_ns *ns, *next;
        int rc = 0;
        int i;

        INIT_LIST_HEAD(&name);

        for (i = 1; i <= 2; i++) {
                ns = malloc(sizeof(*ns));
                if (ns == NULL) {
                        fprintf(stderr, "malloc failed\n");
                        rc = 1;
                        break;
                }
                /* %p requires a void * argument. */
                printf("alloc %p\n", (void *)ns);
                ns->num = i;
                list_add_tail(&ns->list, &name);
        }

        if (rc == 0) {
                list_for_each_entry(ns, &name, list)
                        printf("num is %d\n", ns->num);
        }

        /* The _safe variant is needed because each entry is freed in the body. */
        list_for_each_entry_safe(ns, next, &name, list) {
                list_del(&ns->list);
                printf("free %p\n", (void *)ns);
                free(ns);
        }
        return (rc);
}

Thursday, June 19, 2014

Devtools installation on Centos

Default gcc fails to compile linux 3.x source code. It required me to upgrade
devtools for CentOS.

cd /etc/yum.repos.d

wget http://people.centos.org/tru/devtools-2/devtools-2.repo

yum install devtoolset-2-gcc

              yum install devtoolset-2-binutils

export PATH=/opt/rh/.....:$PATH

Check `gcc -v`; the reported version should be higher than the default gcc version.

Monday, May 12, 2014

All About Ftrace

Ftrace is a Kernel function tracer. It uses the file system debugfs.

To use Ftrace, these kernel options need to be enabled if they are not already.

CONFIG_FTRACE,
CONFIG_HAVE_DYNAMIC_FTRACE,
CONFIG_HAVE_FUNCTION_TRACER,
CONFIG_HAVE_FUNCTION_GRAPH_TRACER,
CONFIG_STACKTRACE

To mount debugfs

mount -t debugfs nodev /sys/kernel/debug

If /sys/kernel/debug is present re-run the above command again.

ftrace directory will be created under /sys/kernel/debug/tracing

The directory contains all the control knobs in regard to kernel tracing.

Some of the important files can be referred when using ftrace
1) available_filter_functions: All the functions ftrace is able to trace.
2) available_tracers: Lists all the tracers compiled into the kernel.
3) current_tracer: shows the currently selected tracer.
4) trace : It holds the output of what being traced in readable format.
5) trace_options: To control the level of output in trace output
                 To enable block tracing echo block > trace_options
                 To disable block tracing echo noblock > trace_options


6) tracing_enabled : To start or stop tracing activity

To enable ftrace

echo function > /sys/kernel/debug/tracing/current_tracer

To enable/disable tracing

echo 1 > tracing_on : To enable tracing
echo 0 > tracing_on   : To disable tracing
echo > trace   : To clear trace log file

echo nop > current_tracer

To Trace or Monitor Block IO
echo 1 > events/block/enable (enable block I/O subsystem)

cat set_event (To display all the subsystem event enabled)

echo 1 > tracing_on
run your program
echo 0 > tracing_on

cat trace to output the ftrace output.

Example of tracing a specific process

traceme.sh

#!/bin/sh
# traceme.sh - run a command with the ftrace function tracer limited to it.
# Usage: traceme.sh command [args...]
#
# The shell writes its own PID to set_ftrace_pid and then exec's the command,
# so the command inherits that PID and is the only process traced.

if [ $# -eq 0 ]; then
    echo "usage: $0 command [args...]" >&2
    exit 1
fi

# Mount point of the first debugfs entry (field 3 of /proc/mounts is the type).
DEBUGFS=`awk '$3 == "debugfs" { print $2; exit }' /proc/mounts`
if [ -z "$DEBUGFS" ]; then
    echo "debugfs is not mounted" >&2
    exit 1
fi

# Reset the tracer and clear any previous trace output.
echo nop > "$DEBUGFS/tracing/current_tracer"
echo > "$DEBUGFS/tracing/trace"
echo "$1"
echo $$ > "$DEBUGFS/tracing/set_ftrace_pid"
echo function > "$DEBUGFS/tracing/current_tracer"
# Alternative tracer that records call graphs instead of a flat function list:
#echo function_graph > $DEBUGFS/tracing/current_tracer
echo 1 > "$DEBUGFS/tracing/tracing_on"
# exec replaces this shell, so nothing placed after it would ever run.
# Stop tracing by hand once the command finishes: echo 0 > tracing_on
# "$@" keeps arguments containing spaces intact.
exec "$@"

echo sys_* > set_ftrace_filter
echo vfs_* >> set_ftrace_filter

traceme.sh ls -al

Thursday, May 8, 2014

All About NVME

NVMe stands for Non-Volatile Memory Express (NVM Express), a host controller interface for SSDs attached over PCIe, designed for low-latency response.

Architecture of NVME on linux looks like this

The NVMe controller exposes its internal control registers through PCI BAR0 and BAR1.

NVMe HCI model has concept of Completion Queue, Submission Queue and Doorbell register.

There are 2 types of queues:
1) Admin Queues
2) I/O Queues

Host Software creates Admin Queue first (Admin Queue Structure Initialization etc..)

Host uses Admin Commands (Submitted to Admin Queue) to create I/O queue pair (Submission and Completion Queue)

Below is the layout of the NVMe control registers. The host writes the Admin SQ base address (offset 0x28) and the Admin CQ base address (offset 0x30) through the memory-mapped register space.

Important Registers
Admin Queue Attributes (AQA, offset 0x24): ASQ0 Size/ACQ0 Size.

Assign base address to ASQ and ACQ based on ASQ and ACQ size to submit any admin command.

The host creates I/O Submission and Completion Queues by placing admin commands in the newly created Admin Queue.

Some of the Admin Commands are
1) Delete IO SQ
2) Create IO SQ
3) Create IO CQ
4) Delete IO CQ
5) Identify,
6) Firmware Activate/Image Download

Multiple I/O Submission Queues are possible
1) Load Distribution across CPU cores
2) One CQ serving multiple SQ.
3) Avoid locking overhead.
4) Queue priority

Once Submission Queue is created host can submit I/O Commands

Supported I/O commands are
1) Flush
2) Write
3) Read

To submit an I/O command, the host places the address of the data buffer into the submission queue entry and then writes the SQ tail doorbell register.

NVME Doorbell follows a Producer/Consumer model

Host acts as
1) Producer of commands -> updates SQ Tail Pointer
2) Consumer of completions -> updates CQ Head Pointer

Controller acts as
1) Consumer of Commands ->update SQ Head Pointer
2) Producer of completions -> updates tail of CQ pointer

Lets consider a scenario

Initial State

SQ1 = { empty }
CQ1 = { empty }
SQ1TailDB = {0}
SQ1HeadDB = {0}
CQ1TailDB = {0}
CQ1HeadDB = {0}

Host add 3 commands

SQ1= {CMD0, CMD1, CMD2, ..... };
SQ1TailDB = {3}

Controller Fetches 3 commands
SQ1HeadDB = {3}
SQ1= {empty} //marked empty

Controller Post completions (Let's say it post 2 completions)
CQ1 = {CMD0, CMD1, empty ......}
CQ1TailDB = {2}

Host is interrupted when CQ1TailDB is updated
Host reads CQ1 and update CQ1HeadDB.

CQ1 = {empty}
CQ1HeadDB={2}

Each command submitted to SQ is 64bytes in size. Command DW0, NSID, Metadata pointer, PRP Entry 1 and PRP Entry 2 have common definitions for all Admin Commands and NVM commands.

Command DW0 format is defined in below figure.