| /* |
| * Copyright 2012-2014 Google Inc. All rights reserved. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /* |
| * A program to test/validate realtime disk performance under various |
| * conditions. |
| */ |
| #include <assert.h> |
| #include <dirent.h> |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <memory.h> |
| #include <pthread.h> |
| #include <sched.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <sys/sendfile.h> |
| #include <sys/mman.h> |
| #include <sys/stat.h> |
| #include <sys/syscall.h> |
| #include <sys/types.h> |
| #include <time.h> |
| #include <unistd.h> |
| #include <netinet/tcp.h> |
| #include <arpa/inet.h> |
| #include "ioprio.h" |
| |
| |
// SCHED_IDLE has been in Linux since 2.6.23, but for some reason neither
// glibc nor uclibc defines it, so supply the numeric value ourselves.
#if !defined(SCHED_IDLE)
#define SCHED_IDLE 5
#endif

// Initial value for TaskStatus.spare_pct_min: an impossible percentage, so
// that measurements compared against it with "<" can replace it.
#define PCT_MIN_INIT 9999
| |
// Preallocate the byte range starting at `offset` and `len` bytes long in
// the file open on `fd`.
//
// When built against uClibc, the fallocate syscall is invoked directly, with
// each 64-bit argument split into two 32-bit halves via __LONG_LONG_PAIR.
// Otherwise this is a plain call to posix_fallocate().
//
// Returns the result of the underlying call unchanged; the caller in this
// file treats 0 as success.
static int _posix_fallocate(int fd, __off64_t offset, __off64_t len) {
#ifdef __UCLIBC__
  const uint32_t offset_hi = (uint32_t)(offset >> 32);
  const uint32_t offset_lo = (uint32_t)offset;
  const uint32_t len_hi = (uint32_t)(len >> 32);
  const uint32_t len_lo = (uint32_t)len;
  return syscall(SYS_fallocate, fd, 0,
                 __LONG_LONG_PAIR(offset_hi, offset_lo),
                 __LONG_LONG_PAIR(len_hi, len_lo));
#else
  return posix_fallocate(fd, offset, len);
#endif
}
| |
| |
// Per-task state. Each worker thread is handed a pointer to its own
// TaskStatus and updates it while running; main() reads the fields to print
// statistics.
struct TaskStatus {
  // Task number assigned by main() and printed by the task when it starts.
  int tasknum;

  // Spinners: incremented continuously in a tight loop.
  // Readers/writers: incremented once for every time slot that was missed
  // (an overrun).
  volatile long long counter;

  // Sum of the spare-time percentages recorded since the last reset.
  volatile long long total_spare_pct;

  // Number of samples that have been added to total_spare_pct.
  volatile long long spare_pct_cnt;

  // Smallest spare-time percentage recorded since the last reset.
  volatile long long spare_pct_min;

  // Socket used by the reader/receiver for the sendfile option.
  int sock_fd;
};
| |
| #define MAX_TASKS 128 |
| static struct TaskStatus *spinners[MAX_TASKS]; |
| static struct TaskStatus writers[MAX_TASKS]; |
| static struct TaskStatus readers[MAX_TASKS]; |
| static struct TaskStatus receivers[MAX_TASKS]; |
| |
| #define MAX_FILE_SIZE (2*1000*1000*1000) |
| #define MAX_BUF (128*1024*1024) |
| static char *buf; |
| |
// Settings chosen on the command line (main() parses them; usage() lists the
// corresponding flags). The initializers are the defaults.

// -t: number of seconds to run; -1 means run without a time limit.
static int timeout = -1;

// -i / -w / -r: how many spinner, writer and reader tasks to start.
static int nspins = 0;
static int nwriters = 0;
static int nreaders = 0;

// -b / -c: size in bytes of a single write / read. A read size of 0 makes
// main() substitute the write size.
static int blocksize_write = 128*1024;
static int blocksize_read = 0;

// -s: target throughput of each stream, in bytes per second.
static int bytes_per_sec = 2*1024*1024;

// -m / -z: socket receive / send buffer size in bytes for the sendfile
// option; 0 leaves the socket's existing size untouched.
static int so_rcvbuf = 0;
static int so_sndbuf = 0;

// Boolean switches, all off by default.
static int keep_old_files = 0;      // -K
static int use_stagger = 0;         // -S
static int use_o_direct_write = 0;  // -D
static int use_o_direct_read = 0;   // -O
static int use_sendfile = 0;        // -N
static int use_mmap = 0;            // -M
static int use_fallocate = 0;       // -F
static int use_fsync = 0;           // -Y
static int use_realtime_prio = 0;   // -R
static int use_ionice = 0;          // -I
static int be_verbose = 0;          // -v
static int print_extra_stats = 0;   // -E
| |
// CHECK(expr): evaluate expr exactly once and terminate the program if it is
// false. The expression text is passed along so the failure message shows
// which check tripped. The value is normalized with !! so that a non-zero
// value wider than int cannot be truncated to 0.
#define CHECK(x) _check(#x, !!(x))

// Helper behind CHECK(). When `result` is zero, prints `str` together with
// the current errno description and aborts; otherwise does nothing.
static void _check(const char *str, int result) {
  if (result) {
    return;
  }
  perror(str);
  assert(result);
  // assert() compiles to nothing under NDEBUG; a failed check must still be
  // fatal, because callers go on to use the result that was just checked.
  abort();
}
| |
| |
// Current value of the kernel's monotonic clock, expressed in microseconds.
// Exits the program with status 7 if the clock cannot be read.
static long long ustime(void) {
  struct timespec now;
  const int rc = clock_gettime(CLOCK_MONOTONIC, &now);
  if (rc < 0) {
    // really should never happen, so don't try to recover
    perror("clock_gettime");
    exit(7);
  }
  const long long whole_usec = now.tv_sec * 1000000LL;
  const long long frac_usec = now.tv_nsec / 1000;
  return whole_usec + frac_usec;
}
| |
| |
// Switch the calling thread (pid argument 0) to scheduling policy `policy`
// with static priority `prio`. Any failure is fatal via CHECK.
static void _set_priority(int policy, int prio) {
  struct sched_param sp = {0};
  sp.sched_priority = prio;
  CHECK(sched_setscheduler(0, policy, &sp) == 0);
}
| |
| |
// Returns the system page size. The value is looked up with sysconf() on the
// first call, logged to stderr once, and served from a cache afterwards.
static long _pagesize(void) {
  static long cached;
  if (cached) {
    return cached;
  }
  cached = sysconf(_SC_PAGESIZE);
  fprintf(stderr, "pagesize=%ld\n", cached);
  return cached;
}
| |
| |
// Dirty the buffer by storing one byte every PAGESIZE bytes, which forces
// the kernel to (eventually) write every touched page out to disk. The
// stored value comes from a counter that starts at a random number and is
// bumped before each store.
static void _page_out(char *buf, size_t count) {
  static long seed;
  if (seed == 0) {
    seed = random();
  }
  size_t pos = 0;
  while (pos < count) {
    seed += 1;
    buf[pos] = seed;
    pos += _pagesize();
  }
}
| |
| |
// Destination for the bytes loaded by _page_in(); it is volatile so the
// loads feeding it are not optimized away.
static volatile char page_tempbyte;

// Load one byte every PAGESIZE bytes from the buffer, which forces the
// kernel to actually page all the accessed pages in from disk.
static void _page_in(char *buf, size_t count) {
  size_t pos = 0;
  while (pos < count) {
    page_tempbyte = buf[pos];
    pos += _pagesize();
  }
}
| |
| |
| static ssize_t _do_write(int fd, char *buf, size_t count) { |
| assert(buf); |
| if (use_mmap) { |
| off_t oldpos = lseek(fd, 0, SEEK_CUR); |
| struct stat st; |
| CHECK(fstat(fd, &st) >= 0); |
| if (st.st_size < oldpos + (ssize_t)count) { |
| if (ftruncate(fd, oldpos + count) < 0) { |
| // probably disk full |
| return -1; |
| } |
| } |
| char *wbuf; |
| CHECK((wbuf = mmap(NULL, count, PROT_WRITE, MAP_SHARED, fd, oldpos)) |
| != MAP_FAILED); |
| off_t newpos = lseek(fd, count, SEEK_CUR); |
| count = newpos - oldpos; |
| _page_out(wbuf, count); |
| CHECK(munmap(wbuf, count) >= 0); |
| return count; |
| } else { |
| // non-mmap version |
| return write(fd, buf, count); |
| } |
| } |
| |
| |
| static ssize_t _do_read(int fd, char **buf, size_t count, int socket_fd) { |
| assert(buf); |
| if (use_mmap) { |
| off_t oldpos = lseek(fd, 0, SEEK_CUR); |
| if (*buf) { |
| CHECK(munmap(*buf, count) >= 0); |
| *buf = NULL; |
| } |
| CHECK((*buf = mmap(NULL, count, PROT_READ, MAP_SHARED, fd, oldpos)) |
| != MAP_FAILED); |
| off_t newpos = lseek(fd, count, SEEK_CUR); |
| count = newpos - oldpos; |
| _page_in(*buf, count); |
| return count; |
| } else if (use_sendfile && socket_fd >= 0) { |
| // send the length as 32-bit number, followed by the data block |
| uint32_t blocksz = count; |
| CHECK(send(socket_fd, &blocksz, sizeof(blocksz), 0) == sizeof(blocksz)); |
| ssize_t ret = sendfile64(socket_fd, fd, 0, count); |
| if (be_verbose) { |
| fprintf(stderr, "sendfile sent %ld/%ld bytes to socket %d\n", |
| (long)ret, (long)count, socket_fd); |
| } |
| return ret; |
| } else { |
| // non-mmap version |
| if (!*buf) { |
| CHECK(posix_memalign((void **)buf, _pagesize(), blocksize_read) == 0); |
| } |
| return read(fd, *buf, count); |
| } |
| } |
| |
| |
| static void *spinner(void *_status) { |
| struct TaskStatus *status = _status; |
| fprintf(stderr, "s#%d ", status->tasknum); |
| |
| // use IDLE priority so that this task *never* runs unless nobody else |
| // is interested. Thus the spinners should only count upward if there's |
| // an actual available idle CPU core to run the task. |
| _set_priority(SCHED_IDLE, 0); |
| |
| volatile long long *counter = &status->counter; |
| while (1) { |
| // Note: it is not necessarily safe to increment a counter here without |
| // a lock (since it's read from other threads). However, introducing |
| // locking or atomic operations would slow down the counter and possibly |
| // introduce CPU barriers or cache line flushes, which defeats the main |
| // purpose of this counter: counting raw, uninterrupted CPU cycles. |
| // Also this number isn't critical to the operation of the program, so |
| // occasional mis-reads of the counter should not be harmful and should |
| // be pretty obvious (an occasional, wildly wrong reading). |
| // Locking would be much more critical if we incremented a given counter |
| // from more than one thread, but we never do that. |
| (*counter)++; |
| } |
| return NULL; |
| } |
| |
| |
| static void _create_socketpair(int *snd_fd, int *rcv_fd) { |
| // create server socket |
| int server_fd; |
| CHECK((server_fd = socket(AF_INET, SOCK_STREAM, 0)) >= 0); |
| int flags = 1; |
| CHECK(setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &flags, |
| sizeof(flags)) == 0); |
| struct sockaddr_in serveraddr; |
| serveraddr.sin_family = AF_INET; |
| serveraddr.sin_addr.s_addr = inet_addr("127.0.0.1"); |
| serveraddr.sin_port = htons(0); |
| |
| CHECK(bind(server_fd, (struct sockaddr*)&serveraddr, |
| sizeof(serveraddr)) == 0); |
| socklen_t len = sizeof(struct sockaddr); |
| CHECK(getsockname(server_fd, (struct sockaddr *)&serveraddr, &len) == 0); |
| int port = ntohs(serveraddr.sin_port); |
| |
| CHECK(listen(server_fd, 1) == 0); |
| |
| // create sender socket |
| int sender_fd; |
| CHECK((sender_fd = socket(AF_INET, SOCK_STREAM, 0)) >= 0); |
| flags = 1; |
| CHECK(setsockopt(sender_fd, IPPROTO_TCP, TCP_NODELAY, &flags, |
| sizeof(int)) == 0); |
| flags = 4; |
| CHECK(setsockopt(sender_fd, SOL_SOCKET, SO_PRIORITY, &flags, |
| sizeof(flags)) == 0); |
| len = sizeof(int); |
| int snd_size, old_snd_size = -1; |
| if (so_sndbuf) { |
| CHECK(getsockopt(sender_fd, SOL_SOCKET, SO_SNDBUF, &old_snd_size, |
| &len) == 0); |
| CHECK(setsockopt(sender_fd, SOL_SOCKET, SO_SNDBUF, &so_sndbuf, |
| sizeof(int)) == 0); |
| } |
| len = sizeof(int); |
| CHECK(getsockopt(sender_fd, SOL_SOCKET, SO_SNDBUF, &snd_size, &len) == 0); |
| |
| // connect sender to server |
| memset(&serveraddr, 0, sizeof(serveraddr)); |
| serveraddr.sin_family = AF_INET; |
| serveraddr.sin_addr.s_addr = inet_addr("127.0.0.1"); |
| serveraddr.sin_port = htons(port); |
| CHECK(connect(sender_fd, (struct sockaddr *)&serveraddr, |
| sizeof(struct sockaddr)) == 0); |
| |
| // reader accepts connection request from sender |
| struct sockaddr clientaddr; |
| len = sizeof(struct sockaddr_in); |
| int receiver_fd; |
| CHECK((receiver_fd = accept(server_fd, &clientaddr, &len)) >= 0); |
| close(server_fd); |
| flags = 1; |
| CHECK(setsockopt(receiver_fd, IPPROTO_TCP, TCP_NODELAY, &flags, |
| sizeof(flags)) == 0); |
| flags = 4; |
| CHECK(setsockopt(receiver_fd, SOL_SOCKET, SO_PRIORITY, &flags, |
| sizeof(flags)) == 0); |
| len = sizeof(int); |
| int rcv_size, old_rcv_size = -1; |
| if (so_rcvbuf) { |
| CHECK(getsockopt(receiver_fd, SOL_SOCKET, SO_RCVBUF, &old_rcv_size, |
| &len) == 0); |
| CHECK(setsockopt(receiver_fd, SOL_SOCKET, SO_RCVBUF, &so_rcvbuf, |
| sizeof(int)) == 0); |
| } |
| len = sizeof(int); |
| CHECK(getsockopt(receiver_fd, SOL_SOCKET, SO_RCVBUF, &rcv_size, &len) == 0); |
| |
| fprintf(stderr, "created socket pair, sender(%d) with so_snd_size:%d " |
| "(was %d), receiver(%d) with so_rcv_size:%d (was %d)\n", sender_fd, |
| snd_size / 2, old_snd_size / 2, receiver_fd, rcv_size / 2, |
| old_rcv_size / 2); |
| *snd_fd = sender_fd; |
| *rcv_fd = receiver_fd; |
| } |
| |
| |
| static void *receiver(void *_status) { |
| struct TaskStatus *status = _status; |
| |
| // use priority higher than the spinner (IDLE) but lower than the writers |
| // and readers (FIFO, prio=10) |
| if (use_realtime_prio) _set_priority(SCHED_FIFO, 1); |
| |
| fprintf(stderr, "n#%d ", status->tasknum); |
| |
| // dummy buffer that we receive all data into |
| void *blackhole; |
| CHECK((blackhole = malloc(2 * blocksize_read)) != NULL); |
| |
| while (1) { |
| ssize_t bytes = recv(status->sock_fd, blackhole, 2 * blocksize_read, 0); |
| CHECK(bytes >= 0); |
| if (bytes == 0) { |
| // socket was closed |
| fprintf(stderr, "receiver socket %d closed\n", status->sock_fd); |
| break; |
| } |
| } |
| fprintf(stderr, "receiver thread exiting!\n"); |
| return NULL; |
| } |
| |
| |
| static void *writer(void *_status) { |
| struct TaskStatus *status = _status; |
| fprintf(stderr, "w#%d ", status->tasknum); |
| |
| int nblocks = MAX_FILE_SIZE / blocksize_write; |
| long long blockdelay = blocksize_write * 1000000LL / bytes_per_sec; |
| |
| if (use_realtime_prio) _set_priority(SCHED_FIFO, 10); |
| if (use_stagger) { |
| // the 0.5 is to stagger the writers in between the staggered readers, |
| // in case nreaders==nwriters. |
| usleep(blockdelay * (0.5 + status->tasknum) / nwriters); |
| } |
| |
| long long starttime = ustime(); |
| for (int fileno = 0; fileno < 1000000; fileno++) { |
| char filename[1024]; |
| sprintf(filename, "db.%d.%d.tmp", status->tasknum, fileno); |
| int fd; |
| mode_t mode = O_RDWR|O_CREAT; |
| if (use_o_direct_write) mode |= O_DIRECT; |
| CHECK((fd = open(filename, mode, 0666)) >= 0); |
| for (int blocknum = 0; blocknum < nblocks; blocknum++) { |
| if (use_fallocate) { |
| struct stat st; |
| CHECK(fstat(fd, &st) == 0); |
| if (st.st_size <= blocknum * blocksize_write) { |
| CHECK(_posix_fallocate(fd, 0, blocknum * blocksize_write + |
| 100*1024*1024) == 0); |
| } |
| } |
| CHECK(_do_write(fd, buf + blocknum * 4096, blocksize_write) > 0); |
| if (use_fsync) fdatasync(fd); |
| long long now = ustime(); |
| starttime += blockdelay; |
| long long spare_time = starttime - now; |
| long long spare_pct = 100 * spare_time / blockdelay; |
| status->total_spare_pct += spare_pct; |
| if (spare_pct < status->spare_pct_min) status->spare_pct_min = spare_pct; |
| status->spare_pct_cnt++; |
| if (spare_time < 0) { |
| // disk fell behind |
| while (now > starttime) { |
| status->counter++; |
| starttime += blockdelay; |
| } |
| } else { |
| // ahead of schedule, wait until next timeslot |
| usleep(spare_time); |
| } |
| } |
| close(fd); |
| } |
| assert(!"created an impossible number of files"); |
| return NULL; |
| } |
| |
| |
| static int open_random_file(int mode) { |
| DIR *dir; |
| struct dirent dentbuf, *dent = NULL; |
| |
| while (1) { |
| CHECK((dir = opendir(".")) != NULL); |
| |
| int count = 0; |
| while (readdir_r(dir, &dentbuf, &dent) == 0 && dent != NULL) { |
| struct stat st; |
| if (stat(dent->d_name, &st) < 0) continue; |
| if (st.st_size > blocksize_read) { |
| count++; |
| } |
| } |
| if (!count) { |
| fprintf(stderr, "reader: no big files to read yet.\n"); |
| closedir(dir); |
| sleep(1); |
| continue; |
| } |
| |
| int want = random() % count, cur = 0; |
| rewinddir(dir); |
| while (readdir_r(dir, &dentbuf, &dent) == 0 && dent != NULL) { |
| struct stat st; |
| if (stat(dent->d_name, &st) < 0) continue; |
| if (st.st_size > blocksize_read) { |
| if (cur == want) { |
| closedir(dir); |
| return open(dent->d_name, mode); |
| } |
| cur++; |
| } |
| } |
| |
| // if we get here, one of the expected files has disappeared; try again. |
| closedir(dir); |
| } |
| // not reached |
| assert(!"should never get here"); |
| } |
| |
| |
| static void *reader(void *_status) { |
| struct TaskStatus *status = _status; |
| fprintf(stderr, "r#%d ", status->tasknum); |
| |
| long long blockdelay = blocksize_read * 1000000LL / bytes_per_sec; |
| char *rbuf = NULL; |
| |
| if (use_realtime_prio) _set_priority(SCHED_FIFO, 10); |
| if (use_stagger) usleep(blockdelay * status->tasknum / nreaders); |
| |
| while (1) { |
| int fd; |
| mode_t mode = O_RDONLY; |
| if (use_o_direct_read) mode |= O_DIRECT; |
| CHECK((fd = open_random_file(mode)) >= 0); |
| struct stat st; |
| CHECK(fstat(fd, &st) == 0); |
| |
| // start reading at a random offset into the file. If there aren't too |
| // many files and we have lots of parallel readers, we might otherwise |
| // get two readers reading the same blocks from the same file at the |
| // same time, and if the disk cache kicks in, that reduces disk load |
| // unnecessarily. |
| off_t start_offset = (random() % (st.st_size / 65536 + 1)) * 65536; |
| lseek(fd, start_offset, SEEK_SET); |
| |
| long long starttime = ustime(), got, totalbytes = start_offset; |
| // we intentionally stop reading after we reach the *original* size of |
| // the file, even if the file has grown since then. Continuing to read |
| // a growing file (presumably being written by a separate writer thread) |
| // seems like a good test, because that's how the system works in real |
| // life. But it turns out to be so beneficial (when not using O_DIRECT, |
| // the kernel can avoid doing disk reads) that it gets in the way of our |
| // benchmark. We need to check worst-case performance (reading old files |
| // while new ones are being written) not average case. |
| while (totalbytes + blocksize_read < st.st_size && |
| (got = _do_read(fd, &rbuf, blocksize_read, status->sock_fd)) > 0) { |
| long long now = ustime(); |
| totalbytes += got; |
| starttime += blockdelay; |
| long long spare_time = starttime - now; |
| long long spare_pct = 100 * spare_time / blockdelay; |
| status->total_spare_pct += spare_pct; |
| status->spare_pct_cnt++; |
| if (spare_pct < status->spare_pct_min) status->spare_pct_min = spare_pct; |
| if (spare_time < 0) { |
| // disk fell behind |
| while (now > starttime) { |
| status->counter++; |
| starttime += blockdelay; |
| } |
| } else { |
| // ahead of schedule, wait until next timeslot |
| usleep(spare_time); |
| } |
| } |
| close(fd); |
| } |
| return NULL; |
| } |
| |
| |
| static long long count_spins(void) { |
| static long long last_end, last_total; |
| long long total = 0; |
| for (int i = 0; i < nspins; i++) { |
| total += spinners[i]->counter; |
| } |
| long long this_end = ustime(), this_spin; |
| if (last_end) { |
| this_spin = (total - last_total) / (this_end - last_end); |
| } else { |
| this_spin = 0; |
| } |
| last_end = this_end; |
| last_total = total; |
| return this_spin; |
| } |
| |
| |
| static long long sum_tasks(struct TaskStatus *array, int nelems) { |
| long long total = 0; |
| for (int i = 0; i < nelems; i++) { |
| total += array[i].counter; |
| } |
| return total; |
| } |
| |
| |
| static long long avg_spare_time(struct TaskStatus *array, int nelems) { |
| long long total = 0; |
| for (int i = 0; i < nelems; i++) { |
| if (array[i].spare_pct_cnt) { |
| total += array[i].total_spare_pct / array[i].spare_pct_cnt; |
| array[i].total_spare_pct = array[i].spare_pct_cnt = 0; |
| } |
| } |
| return total / nelems; |
| } |
| |
| |
| static long long min_spare_time(struct TaskStatus *array, int nelems) { |
| long long min_pct = array[0].spare_pct_min; |
| array[0].spare_pct_min = PCT_MIN_INIT; |
| for (int i = 1; i < nelems; i++) { |
| if (array[i].spare_pct_min < min_pct) |
| min_pct = array[i].spare_pct_min; |
| array[i].spare_pct_min = PCT_MIN_INIT; |
| } |
| return min_pct; |
| } |
| |
| |
// Print the command-line help text to stderr and exit with status 99.
// Never returns.
static void usage(void) {
  static const char help_text[] =
      "\n"
      "Usage: diskbench [options]\n"
      " -h, -? This help message\n"
      " -t ... Timeout (number of seconds to run test)\n"
      " -i ... Number of idle spinners (to occupy CPU threads)\n"
      " -w ... Number of parallel writers (creating files)\n"
      " -r ... Number of parallel readers (reading files)\n"
      " -b ... Block size (kbyte size of a single read/write)\n"
      " -c ... Alternative block size for reading (kbyte)\n"
      " -s ... Speed (kbytes read/written per sec, per stream)\n"
      " -m ... Socket receive buffer size in KB (for sendfile)\n"
      " -z ... Socket send buffer size in KB (for sendfile)\n"
      " -K Keep old temp output files from previous run\n"
      " -S Stagger reads and writes evenly (default: clump them)\n"
      " -D Use O_DIRECT for writing\n"
      " -O Use O_DIRECT for reading\n"
      " -N Use sendfile to send read data through a socket\n"
      " to a local client\n"
      " -M Use mmap()\n"
      " -F Use fallocate()\n"
      " -Y Use fdatasync() after writing\n"
      " -R Use CPU real-time priority\n"
      " -I Use ionice real-time disk priority\n"
      " -E Print extra stats\n"
      " -v Verbose output\n";
  fputs(help_text, stderr);
  exit(99);
}
| |
| |
| int main(int argc, char **argv) { |
| srandom(time(NULL)); |
| |
| int opt; |
| while ((opt = getopt(argc, argv, "?ht:i:w:r:b:c:s:m:z:KSDONMFYRIEv")) != -1) { |
| switch (opt) { |
| case '?': |
| case 'h': |
| usage(); |
| break; |
| case 't': |
| timeout = atoi(optarg); |
| break; |
| case 'i': |
| nspins = atoi(optarg); |
| break; |
| case 'w': |
| nwriters = atoi(optarg); |
| break; |
| case 'r': |
| nreaders = atoi(optarg); |
| break; |
| case 'b': |
| blocksize_write = atoi(optarg) * 1024; |
| break; |
| case 'c': |
| blocksize_read = atoi(optarg) * 1024; |
| break; |
| case 's': |
| bytes_per_sec = atoi(optarg) * 1024; |
| break; |
| case 'm': |
| so_rcvbuf = atoi(optarg) * 1024; |
| break; |
| case 'z': |
| so_sndbuf = atoi(optarg) * 1024; |
| break; |
| case 'K': |
| keep_old_files = 1; |
| break; |
| case 'S': |
| use_stagger = 1; |
| break; |
| case 'D': |
| use_o_direct_write = 1; |
| break; |
| case 'O': |
| use_o_direct_read = 1; |
| break; |
| case 'N': |
| use_sendfile = 1; |
| break; |
| case 'M': |
| use_mmap = 1; |
| break; |
| case 'F': |
| use_fallocate = 1; |
| break; |
| case 'Y': |
| use_fsync = 1; |
| break; |
| case 'R': |
| use_realtime_prio = 1; |
| break; |
| case 'I': |
| use_ionice = 1; |
| break; |
| case 'E': |
| print_extra_stats = 1; |
| break; |
| case 'v': |
| be_verbose = 1; |
| break; |
| } |
| } |
| |
| if (nspins > MAX_TASKS || nreaders > MAX_TASKS || nwriters > MAX_TASKS) { |
| fprintf(stderr, "\nfatal: idlers, readers, and writers must all be <= %d\n", |
| MAX_TASKS); |
| return 8; |
| } |
| |
| if (!nspins && !nreaders && !nwriters) { |
| fprintf(stderr, "\nfatal: must specify at least one of -i, -r, -w\n"); |
| return 9; |
| } |
| |
| if (!blocksize_read) blocksize_read = blocksize_write; |
| |
| CHECK(posix_memalign((void **)&buf, _pagesize(), MAX_BUF) == 0); |
| for (int i = 0; i < MAX_BUF; i++) { |
| buf[i] = i % 257; |
| } |
| |
| if (nwriters == 0) { |
| fprintf(stderr, "not clearing old temp files (-w 0)\n"); |
| } else if (keep_old_files) { |
| fprintf(stderr, "not clearing old temp files (-K)\n"); |
| } else { |
| fprintf(stderr, "clearing old temp files.\n"); |
| CHECK(system("rm -f db.*.*.tmp") == 0); |
| } |
| |
| fprintf(stderr, "syncing disks.\n"); |
| sync(); |
| |
| fprintf(stderr, "starting: %d idlers, %d readers, %d writers\n", |
| nspins, nreaders, nwriters); |
| |
| for (int i = 0; i < nspins; i++) { |
| // spinners[] could just be an array of structs, instead of pointers to |
| // structs, but we want to make sure the counters don't all share the |
| // same cache line. Prevening this sharing more than doubles the |
| // counting rate on my x86_64 (Xeon X5650) machine, although it seems |
| // to make no difference on BCM7425. I don't want an endless stream of |
| // conflicting cache line flushes to artificially inflate CPU usage. |
| spinners[i] = calloc(1, sizeof(struct TaskStatus)); |
| spinners[i]->tasknum = i + 1; |
| pthread_t thread; |
| CHECK(pthread_create(&thread, NULL, spinner, spinners[i]) == 0); |
| } |
| |
| for (int i = 0; i < nspins; i++) { |
| spinners[i]->counter = 0; |
| } |
| count_spins(); // initialize timings |
| sleep(1); // run for one cycle without any non-spinner activity |
| long long best_spin = count_spins(); |
| if (!best_spin) best_spin = 1; |
| fprintf(stderr, "\nidle spins:%lld\n", best_spin); |
| |
| if (use_ionice) { |
| int realtime = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); |
| CHECK(ioprio_set(IOPRIO_WHO_PROCESS, getpid(), realtime) != -1); |
| } |
| |
| for (int i = 0; i < nwriters; i++) { |
| memset(&writers[i], 0, sizeof(writers[i])); |
| writers[i].tasknum = i; |
| writers[i].spare_pct_min = PCT_MIN_INIT; |
| pthread_t thread; |
| CHECK(pthread_create(&thread, NULL, writer, &writers[i]) == 0); |
| } |
| |
| for (int i = 0; i < nreaders; i++) { |
| memset(&readers[i], 0, sizeof(readers[i])); |
| if (use_sendfile) { |
| memset(&receivers[i], 0, sizeof(receivers[i])); |
| _create_socketpair(&readers[i].sock_fd, &receivers[i].sock_fd); |
| receivers[i].tasknum = i; |
| pthread_t thread; |
| CHECK(pthread_create(&thread, NULL, receiver, &receivers[i]) == 0); |
| } else { |
| readers[i].sock_fd = -1; // disable |
| } |
| readers[i].tasknum = i; |
| readers[i].spare_pct_min = PCT_MIN_INIT; |
| pthread_t thread; |
| CHECK(pthread_create(&thread, NULL, reader, &readers[i]) == 0); |
| } |
| |
| // that cycle was filled with startup traffic, just ignore it |
| usleep(100*1000); |
| count_spins(); |
| fprintf(stderr, "\n"); |
| |
| long long count = 0; |
| while (timeout == -1 || count < timeout) { |
| sleep(1); |
| long long this_spin = count_spins(); |
| if (this_spin > best_spin) best_spin = this_spin; |
| if (print_extra_stats) { |
| printf("%5lld spins:%lld/%lld cpu:%.2f%% overruns: w=%lld r=%lld " |
| "avg/min spare_time: w=%lld/%lld%% r=%lld/%lld%%\n", |
| ++count, |
| this_spin, best_spin, |
| 100 * (1-(this_spin*1.0/best_spin)), |
| sum_tasks(writers, nwriters), |
| sum_tasks(readers, nreaders), |
| avg_spare_time(writers, nwriters), |
| min_spare_time(writers, nwriters), |
| avg_spare_time(readers, nreaders), |
| min_spare_time(readers, nreaders)); |
| } else { |
| printf("%5lld spins:%lld/%lld cpu:%.2f%% overruns: w=%lld r=%lld\n", |
| ++count, |
| this_spin, best_spin, |
| 100 * (1-(this_spin*1.0/best_spin)), |
| sum_tasks(writers, nwriters), |
| sum_tasks(readers, nreaders)); |
| } |
| fflush(stdout); |
| } |
| return 0; |
| } |