/* * $Id: Profiler.c,v 1.7 2007/04/25 11:30:16 adrian Exp $ * * DEBUG: section 81 CPU Profiling Routines * AUTHOR: Andres Kroonmaa, Sep.2000 * * SQUID Internet Object Cache http://squid.nlanr.net/Squid/ * ---------------------------------------------------------- * * Squid is the result of efforts by numerous individuals from the * Internet community. Development is led by Duane Wessels of the * National Laboratory for Applied Network Research and funded by the * National Science Foundation. Squid is Copyrighted (C) 1998 by * the Regents of the University of California. Please see the * COPYRIGHT file for full details. Squid incorporates software * developed and/or copyrighted by other sources. Please see the * CREDITS file for full details. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA. * */ /* * CPU Profiling implementation. * * This library implements the Probes needed to gather stats. * See src/ProfStats.c which implements historical recording and * presentation in CacheMgr.cgi. * * For timing we prefer on-CPU ops that retrieve cpu ticks counter. * For Intel, this is "rdtsc", which is 64-bit counter that virtually * never wraps. For alpha, this is "rpcc" which is 32-bit counter and * wraps every few seconds. Currently, no handling of wrapping counters * is implemented. Other CPU's are also not covered. Potentially all * modern CPU's has similar counters. * * Usage. * Insert macro PROF_state(probename) in strategic places in code. * PROF_start(probename); * ... section of code measured ... * PROF_stop(probename); * * probename must be added to profiling.h into xprof_type enum list * with prepended "XPROF_" string. * * Description. * PROF gathers stats per probename into structures. It indexes these * structures by enum type index in an array. * * PROF records best, best, average and worst values for delta time, * also, if UNACCED is defined, it measures "empty" time during which * no probes are in measuring state. This allows to see time "unaccounted" * for. If OVERHEAD is defined, additional calculations are made at every * probe to measure approximate overhead of the probe code itself. * * Probe data is stored in linked-list, so the more probes you define, * the more overhead is added to find the deepest nested probe. To reduce * average overhead, linked list is manipulated each time PR_start is * called, so that probe just started is moved 1 position up in linkedlist. * This way frequently used probes are moved closer to the head of list, * reducing average overhead. * Note that all overhead is on the scale of one hundred of CPU clock * ticks, which on the scale of submicroseconds. Yet, to optimise really * fast and frequent sections of code, we want to reduce this overhead * to absolute minimum possible. * * For actual measurements, probe overhead cancels out mostly. Still, * do not take the measured times as facts, they should be viewed in * relative comparison to overall CPU time and on the same platform. * * Every 1 second, Event within squid is called that parses gathered * statistics of every probe, and accumulates that into historical * structures for last 1,5,30 secs, 1,5,30 mins, and 1,5 and 24 hours. * Each second active probe stats are reset, and only historical data * is presented in cachemgr output. * * Reading stats. * "Worst case" may be misleading. Anything can happen at any section * of code that could delay reaching to probe stop. For eg. system may * need to service interrupt routine, task switch could occur, or page * fault needs to be handled. In this sense, this is quite meaningless * metric. "Best case" shows fastest completion of probe section, and * is also somewhat useless, unless you know that amount of work is * constant. Best metric to watch is "average time" and total cumulated * time in given timeframe, which really show percentage of time spent * in given section of code, and its average completion time. This data * could be used to detect bottlenecks withing squid and optimise them. * * TOTALS are quite off reality. Its there just to summarise cumulative * times and percent column. Percent values over 100% shows that there * have been some probes nested into each other. * */ #include "profiling.h" #include #ifdef USE_XPROF_STATS #if HAVE_GNUMALLLOC_H #include #elif HAVE_MALLOC_H #include #endif #if HAVE_UNISTD_H #include #endif /* Exported Data */ TimersArray *xprof_Timers = NULL; /* Private stuff */ /* new stuff */ #define MAXSTACKDEPTH 512 struct _callstack_entry { int timer; /* index into timers array */ const char *name; hrtime_t start, stop, accum; }; struct _callstack_entry cstack[MAXSTACKDEPTH]; int cstack_head = 0; #if defined(_MSC_VER) /* Microsoft C Compiler ONLY */ static __inline void #else static inline void #endif xprof_update(xprof_stats_data * head) { if (head->delta < head->best) head->best = head->delta; if (head->worst < head->delta) head->worst = head->delta; head->summ += head->delta; head->count++; } static xprof_stats_data *xp_UNACCOUNTED; static int xprof_inited = 0; static void xprof_InitLib(void) { if (xprof_inited) return; xprof_Timers = calloc(XPROF_LAST + 2, sizeof(xprof_stats_node)); xprof_Timers[XPROF_PROF_UNACCOUNTED]->name = "PROF_UNACCOUNTED"; xprof_Timers[XPROF_PROF_UNACCOUNTED]->accu.start = get_tick(); xp_UNACCOUNTED = &xprof_Timers[XPROF_PROF_UNACCOUNTED]->accu; cstack_head = 0; xprof_inited = 1; } void xprof_start(xprof_type type, const char *timer) { hrtime_t tt = get_tick(); if (!xprof_inited) xprof_InitLib(); /* If nested, stop current stack frame */ if (cstack_head > 0) { cstack[cstack_head - 1].accum += get_tick() - cstack[cstack_head - 1].start; cstack[cstack_head - 1].start = -1; } /* Are we at the first level? If so; stop the unaccounted timer */ if (cstack_head == 0) { assert(xp_UNACCOUNTED->start != -1); xp_UNACCOUNTED->delta = tt - xp_UNACCOUNTED->start; xp_UNACCOUNTED->start = -1; xprof_update(xp_UNACCOUNTED); } /* Allocate new stack frame */ cstack[cstack_head].start = tt; cstack[cstack_head].stop = -1; cstack[cstack_head].accum = 0; cstack[cstack_head].timer = type; cstack[cstack_head].name = timer; cstack_head++; assert(cstack_head < MAXSTACKDEPTH); } void xprof_stop(xprof_type type, const char *timer) { hrtime_t tt = get_tick(); assert(cstack_head > 0); cstack_head--; assert(cstack[cstack_head].timer == type); /* Record timer details */ cstack[cstack_head].accum += tt - cstack[cstack_head].start; xprof_Timers[type]->accu.delta = cstack[cstack_head].accum; xprof_Timers[type]->name = timer; /* Update */ xprof_update(&xprof_Timers[type]->accu); /* Restart previous timer if we're not at the top level */ if (cstack_head > 0) { cstack[cstack_head - 1].start = tt; cstack[cstack_head - 1].stop = 0; return; } /* Get here? We're at the top level; unaccounted */ xp_UNACCOUNTED->start = tt; } #endif /* USE_XPROF_STATS */