-------------------------------------------------- col_ed.c -------------------------------------------------- /** ** col_ed - column editor ** ** CREATED: 2014.05.20 ABS copied from parse_conway ** MODIFIED: 2014.05.21 ABS fleshed out ** MODIFIED: 2014.05.29 ABS added ScaleDate ** MODIFIED: 2014.06.03 ABS looking for date error: ** 7/27/2013 -> -89402112 instead of 1374883200 ** changed ScaleDate to DateDenom ** MODIFIED: 2014.07.01 ABS max commands up to 5000 ** MODIFIED: 2014.09.04 ABS added threshold ** **/ #include "col_ed.h" main(argc, argv) int argc; char** argv; { /************************************************ implemented: D - DATE I - INTEGER SUBSTITUTE P - PRE/POSTFIX R - ROW-BASED SUBSTITUTE S - SUBSTITUTE ideas: C Constrain - discard row outside range T Trim - force to range M Missing - by column L Log - of column - log(0) val F Floor - truncate to int R Round - to int N Nonnumeric - replace w/ val, del row or col Average - add col New - new row with avg + sum columns, add column Y Yank - delete column Q Quotes - add quotes to output G Global - substitute J Join - with col , sep P Product - add col * D Diff - two columns, add column * R Ratio - add column * W White - add noise, min/max M Mimic - dup column B Blot - replace w/ value E Extremes - force vals outside range O Outliers - delete row if outside range I Index - add index column H Histogram - balance ??? A Analyze - Write/Read stats # - remove comments ! - shell command & - logical and 2 cols, add new | - or ~ - not = - eq X - XOR + - * / % ************************************************/ /* declare functions */ int count_commas(); long long int date_to_epoch(); char *extract_filename(); char *replace(); /* declare local variables */ int len, numCols, newNumCols, current_command_row; int num_command_rows, num_command_columns; int current_data_row, temp_int_old, temp_int_mat; long long int llint0; int targetCol[MAX_COMMAND_ROWS]; float tmp_flt, min_flt, max_flt; int skipLine; /* Boolean */ char in_row[MAX_LINE]; char in_command_row[MAX_LINE]; char date_str[MAX_LINE]; char colArray[MAX_COLS][MAX_FIELD]; char commandArray[MAX_COMMAND_ROWS][4][MAX_FIELD]; char *tmp_str; /* somebody else mallocs */ char temp_string[MAX_FIELD]; FILE* command_fp; /* initialize global variables (including parameters) */ defaultParms(); parseArgs(argc, argv); ProgName = extract_filename(argv[0]); /* initialize local variables */ num_command_rows = lines_in_file(CommandFile); if (Debug != FALSE) { fprintf(stderr, "%s: DEBUG: Debug = %d\n", ProgName, Debug); fprintf(stderr, "%s: DEBUG: Scramble = %d\n", ProgName, Scramble); fprintf(stderr, "%s: DEBUG: CommandFile = %s\n", ProgName, CommandFile); fprintf(stderr, "%s: DEBUG: MissingValues = %s\n", ProgName, MissingValues); fprintf(stderr, "%s: DEBUG: DateDenom = %lld\n", ProgName, DateDenom); fprintf(stderr, "%s: DEBUG: num_command_rows = %d\n", ProgName, num_command_rows); fprintf(stderr, "%s: DEBUG: DateFormat = %s\n", ProgName, DateFormat); fflush(stderr); } /* open commands file */ command_fp = fopen(CommandFile, "r"); if (command_fp == NULL) { fprintf(stderr, "%s: ERROR: file open of %s failed, error: %s\n", ProgName, CommandFile, strerror(errno)); exit(0); } /* read commands file */ for (current_command_row = 0; current_command_row < num_command_rows; current_command_row++) { len = getl(command_fp, in_command_row, MAX_LINE); if (Debug != FALSE) { fprintf(stderr, "%s: DEBUG: command row %d: >%s<\n", ProgName, current_command_row, in_command_row); } if (len < 1) { fprintf(stderr, "%s: ERROR: command row %d has no data\n", ProgName, current_command_row); exit(0); } num_command_columns = 1 + count_commas(in_command_row); if (4 != num_command_columns) { fprintf(stderr, "%s: ERROR: command row %d has %d columns, should be 4\n", ProgName, current_command_row, num_command_columns); exit(0); } /* now load array */ /* note: command rows start at 0, columns at 1 */ parse_csv_line_to_string(in_command_row, 4, &commandArray[current_command_row][0][0]); targetCol[current_command_row] = atoi(commandArray[current_command_row][1]); if (Debug != FALSE) { fprintf(stderr, "%s: DEBUG: targetCol[%d]: %d\n", ProgName, current_command_row, targetCol[current_command_row]); int ccc; for (ccc = 1; ccc <= 4; ccc++) { fprintf(stderr, "%s: DEBUG: commandArray[%d][%d]: >%s<\n", ProgName, current_command_row, ccc, commandArray[current_command_row][ccc]); } } } /* read, process and write csv (stdin to stdout) */ current_data_row = 0; len = getl(stdin, in_row, MAX_LINE); /* trickier logic: we're reading stdin and can't count lines first */ while (len > 0) { int z; skipLine = FALSE; current_data_row++; if (current_data_row == 1) { numCols = 1 + count_commas(in_row); /* strip newline from in_row */ for (z = 0; z < MAX_LINE; z++) { if (in_row[z] == '\n' || in_row[z] == '\r') { in_row[z] = EOS; } } fprintf(stdout, "%s\r\n", in_row); len = getl(stdin, in_row, MAX_LINE); } else { newNumCols = 1 + count_commas(in_row); if (numCols != newNumCols) { fprintf(stderr, "%s: ERROR: row %d has %d columns, should be %d\n", ProgName, current_data_row - 1, newNumCols, numCols); exit(0); } else { int cc; if (Debug != FALSE) { fprintf(stderr, "%s: DEBUG: about to call parse_csv_line_to_string(); row: %d\n", ProgName, current_data_row); } parse_csv_line_to_string(in_row, numCols, colArray); /* process */ for (current_command_row = 0; current_command_row < num_command_rows; current_command_row++) { if (Debug != FALSE) { fprintf(stderr, "%s: DEBUG: commandArray[%d][2][0]: %c\n", ProgName, current_command_row, commandArray[current_command_row][2][0]); } switch (commandArray[current_command_row][2][0]) { /* DATE */ case 'D' : llint0 = date_to_epoch( colArray[targetCol[current_command_row]], DateFormat); llint0 = llint0/DateDenom; sprintf(temp_string, "%lld", llint0); if (Debug != FALSE) { fprintf(stderr, "%s: DEBUG: llint0: %lld ; temp_string: >%s<\n", ProgName, llint0, temp_string); } strcpy( colArray[targetCol[current_command_row]], temp_string); break; /* INTEGER SUBSTITUTE */ case 'I' : temp_int_old = atoi(colArray[targetCol[current_command_row]]); temp_int_mat = atoi(commandArray[current_command_row][3]); if (temp_int_old == temp_int_mat) { /* overwrite the whole string */ strcpy( colArray[targetCol[current_command_row]], commandArray[current_command_row][4]); } break; /* PRE/POSTFIX */ case 'P' : if (atoi(commandArray [current_command_row][3]) == 0) { /* PREFIX */ sprintf(temp_string, "%s%s", commandArray [current_command_row][4], colArray [targetCol [current_command_row]]); strcpy( colArray [targetCol [current_command_row]], temp_string); if (Debug != FALSE) { fprintf(stderr, "%s: DEBUG: Pr at row:%d; temp_string: %s\n", ProgName, current_data_row, temp_string); } } else { /* POSTFIX */ sprintf(temp_string, "%s%s", colArray [targetCol [current_command_row]], commandArray [current_command_row][4]); strcpy( colArray [targetCol [current_command_row]], temp_string); if (Debug != FALSE) { fprintf(stderr, "%s: DEBUG: Po at row:%d; temp_string: %s\n", ProgName, current_data_row, temp_string); } } break; /* ROW-BASED SUBSTITUTE */ case 'R' : if (current_data_row == atoi(commandArray [current_command_row][3])) { strcpy( colArray [targetCol [current_command_row]], commandArray [current_command_row][4]); } break; /* SUBSTITUTE */ case 'S' : tmp_str = replace( colArray[targetCol[current_command_row]], commandArray[current_command_row][3], commandArray[current_command_row][4]); strcpy( colArray[targetCol[current_command_row]], tmp_str); break; /* THRESHOLD */ case 'T' : tmp_flt = atof( colArray[targetCol[current_command_row]]); min_flt = atof( commandArray[current_command_row][3]); max_flt = atof( commandArray[current_command_row][4]); if ((tmp_flt < min_flt) || (tmp_flt > max_flt)) { skipLine = TRUE; } break; default: fprintf(stderr, "%s: ERROR: illegal command: %c\n", ProgName, commandArray[current_command_row][2][0]); exit(0); } } /* write */ if (skipLine == FALSE) { if (Debug != FALSE) { fprintf(stderr, "%s: DEBUG: about to write row: %d\n", ProgName, current_data_row); } for (cc = 1; cc < numCols; cc++) { fprintf(stdout, "%s,", colArray[cc]); } fprintf(stdout, "%s\r\n", colArray[cc]); } else { if (Debug != FALSE) { fprintf(stderr, "%s: DEBUG: about to skip row: %d\n", ProgName, current_data_row); } } /* fetch for next iteration */ len = getl(stdin, in_row, MAX_LINE); } } } #ifdef OOPS current_data_row--; /* we counted the failure to read */ #endif if (Debug != FALSE) { fprintf(stderr, "%s: DEBUG: total data rows: %d\n", ProgName, current_data_row); } } /* end main */ /* * parse_csv_line_to_string() * */ parse_csv_line_to_string(src_line, num_cols, dest_str_array) char* src_line; int num_cols; char dest_str_array[MAX_COLS][MAX_FIELD]; { char single_char; int col_num = 1; int in_char_num = 0; int out_char_num = 0; if (Debug != FALSE) { fprintf(stderr, "%s: parse_csv_line_to_string(): DEBUG: src_line: >%s<\n", ProgName, src_line); } while (col_num <= num_cols) { single_char = src_line[in_char_num]; while ((single_char != ',') && (single_char != EOS) && (single_char != '\n') && (single_char != '\r')) { dest_str_array[col_num][out_char_num++] = single_char; single_char = src_line[++in_char_num]; } dest_str_array[col_num][out_char_num] = EOS; if (dest_str_array[col_num][0] == EOS) { strcpy(&dest_str_array[col_num][0], MissingValues); } if (Debug != FALSE) { fprintf(stderr, "%s: DEBUG: parse_csv_line_to_string: dest_str_array[%d]: >%s<\n", ProgName, col_num, dest_str_array[col_num]); } /* in_char_num now points to first input comma */ #ifdef SUPER_DEBUG if (Debug != FALSE) { fprintf(stderr, "DEBUG: parse_csv_line_to_string(): src_line[%d] = %c (should be comma)\n", in_char_num, src_line[in_char_num]); } #endif in_char_num++; out_char_num = 0; col_num++; } } /* end parse_csv_line_to_string() */ /* * convertToUpper() - convert string to uppercase (from Stackoverflow) */ void convertToUpper(char *text, char *nText){ int col; for(col=0; col<=strlen(text); col++){ if( (text[col] > 96 ) && (text[col] < 123) ) // is the char lower case nText[col] = text[col] - 'a' + 'A'; //make upper else nText[col] = text[col]; //do nothing } } /* * defaultParms() - initialize global variables */ defaultParms() { Debug = FALSE; Scramble = FALSE; ColsOut[0] = 0; DateDenom = 60.0*60.0*24.0; strcpy(CommandFile, "commands.txt"); strcpy(DateFormat, "%m/%d/%Y %H:%M:%S"); strcpy(MissingValues, "99999"); } /* end defaultParms() */ /* * parseArgs() - parse arguments into global variables */ parseArgs(argc, argv) int argc; char** argv; { int i, c; for (i = 0; i < argc; i++) { if (argv[i][0] == '-') { switch (argv[i][1]) { case 'C' : ColsOut[0] = atoi(&argv[++i][0]); Scramble = (ColsOut[0] != 0); if (Scramble) { for (c = 1; c <= ColsOut[0]; c++) { ColsOut[c] = atoi(&argv[++i][0]); } } break; case 'D' : Debug = TRUE; break; case 'd' : strcpy(DateFormat, &argv[++i][0]); break; case 'f' : strcpy(CommandFile, &argv[++i][0]); break; case 'h' : help(argv); exit(0); break; case 'm' : strcpy(MissingValues, &argv[++i][0]); break; case 'S' : DateDenom = atof(&argv[++i][0]); break; case 'u' : usage(argv); exit(0); break; default: fprintf(stderr, "%s: ERROR: illegal flag: %s\n", ProgName, argv[i]); usage(argv); exit(0); break; } /* end switch */ } /* end if */ } /* end for i */ } /* end parseArgs() */ /* * getl() - modified from Kernighan & Ritchie to take stream */ int getl(stream, s, lim) /* get line into s, return length */ FILE *stream; char s[]; int lim; { int c, i; i = 0; while (--lim > 0 && (c = getc(stream)) != EOF && c != '\n') s[i++] = c; if (c == '\n') s[i++] = c; s[i] = '\0'; return(i); } /* end getl() */ /* * date_to_epoch() */ long long int date_to_epoch(date_string, format_string) char * date_string; char * format_string; { struct tm tm; time_t t; if (strptime(date_string, format_string, &tm) == NULL) /* Handle error */ return(-1); tm.tm_isdst = -1; /* Not set by strptime(); tells mktime() to determine whether daylight saving time is in effect */ t = mktime(&tm); if (t == -1) /* Handle error */ return(-1); if (Debug != FALSE) { fprintf(stderr, "%s: date_to_epoch(): DEBUG: date_string = %s; t = %lld\n", ProgName, date_string, (long long int)t); } return((long long int) t); } /* * lines_in_file() - from Stacktrace */ int lines_in_file(filename) char filename[]; { FILE *fp; /* */ int c; /* Nb. int (not char) for the EOF */ unsigned long newline_count = 0; /* count the newline characters */ fp = fopen(filename, "r"); if (fp == NULL) { fprintf(stderr, "%s: ERROR: file open of %s failed, error: %s\n", ProgName, filename, strerror(errno)); exit(0); } while ( (c=fgetc(fp)) != EOF ) { if ( c == '\n' ) { newline_count++; } } fclose(fp); return newline_count; } /* end lines_in_file() */ /* * extract_filename() */ char * extract_filename(char *str) { int ch = '/'; size_t len; char *pdest; char *inpfile = NULL; // Search backwards for last backslash in filepath pdest = strrchr(str, ch); // if backslash not found in filepath if(pdest == NULL ) { printf( "Result:\t%c not found\n", ch ); pdest = str; // The whole name is a file in current path? } else { pdest++; // Skip the backslash itself. } // extract filename from file path len = strlen(pdest); inpfile = malloc(len+1); // Make space for the zero. strncpy(inpfile, pdest, len+1); // Copy including zero. return inpfile; } /* * count_commas() */ int count_commas(char *str) { char single_char; int char_position = 0; int comma_count = 0; single_char = str[char_position++]; while (single_char != EOS) { if (single_char == ',') { comma_count++; } single_char = str[char_position++]; } return(comma_count); } /* * replace () * * from: bytes.com/topic/c/answers/223500-how-replace-substring-string-using-c * */ #include #include #include char *replace(const char *s, const char *old, const char *new) { char *ret; int i, count = 0; size_t newlen = strlen(new); size_t oldlen = strlen(old); for (i = 0; s[i] != '\0'; i++) { if (strstr(&s[i], old) == &s[i]) { count++; i += oldlen - 1; } } ret = malloc(i + count * (newlen - oldlen)); if (ret == NULL) exit(EXIT_FAILURE); i = 0; while (*s) { if (strstr(s, old) == s) { strcpy(&ret[i], new); i += newlen; s += oldlen; } else ret[i++] = *s++; } ret[i] = '\0'; return ret; } #ifdef UNIT_TEST int main(void) { char mystr[] = "##this is##a examp#le"; char *newstr = NULL; puts(mystr); newstr = replace(mystr, "##", "****"); printf("%s\n", newstr); free(newstr); return 0; } #endif -------------------------------------------------- col_ed.h -------------------------------------------------- /* * col_ed.h -- include file for col_ed.c */ /* includes */ #include #include #include #include #include #include #include /* constants */ #define TRUE (1) #define FALSE (0) #define MAX_LINE (8192) #define MAX_COLS (1024) #define MAX_COMMAND_ROWS (5000) #define MAX_FIELD (64) #define EOS (0) #define STRING_EQUAL (0) #define ERROR_CODE (-1) /* global variables */ char *ProgName; /* name of this program -- malloced elsewhere */ /* flag-settable parameters (global variables) */ int Debug; /* Boolean: TRUE = debug writes */ int Scramble; /* Boolean */ int ColsOut[MAX_COLS]; /* remaps columns for output */ char CommandFile[MAX_LINE]; /* filename */ char DateFormat[MAX_LINE]; /* */ char MissingValues[MAX_FIELD]; /* */ long long int DateDenom; /* */ /* macros */ -------------------------------------------------- Makefile -------------------------------------------------- O_FILES = col_ed.o usage.o help.o CFLAGS = -lc -g -lm CC = gcc col_ed: $(O_FILES) $(CC) $(CFLAGS) -o col_ed $(O_FILES) -------------------------------------------------- Test -------------------------------------------------- # /home/Alan/swdev/C/col_ed/col_ed -D -m -666 -S 1.0 < in.csv > out.csv # /home/Alan/swdev/C/col_ed/col_ed -D -d "%Y%m%d%H%M%S" -m -666 -S 1.0 < in2.csv > out2.csv -------------------------------------------------- commands.txt -------------------------------------------------- 1,D,FOO,BAR 2,S,LARCH,0 2,S,STARLING,1 3,T,40.0,80.0 4,P,0,Mr. 4,P,1,, Esq. -------------------------------------------------- in.csv -------------------------------------------------- head_1,head_2,head_3,head_4 12/6/2001 12:33:45,STARLING,77.77,0 2/2/2002 12:12:12,LARCH,66.66,1 3/3/2003 13:13:13,,55.55,2 4/21/2014 21:03:01,0,0,3 -------------------------------------------------- out.csv -------------------------------------------------- head_1,head_2,head_3,head_4 1007670825,1,77.77,Mr. 0, Esq. 1012680732,0,66.66,Mr. 1, Esq. 1046725993,-666,55.55,Mr. 2, Esq. --------------------------------------------------