Read Input Files
Provide efficient mechanisms for reading input files.
Notes
The basic scheme is to read the entire file into
virtual memory and then use pointers to extract
individual lines.
- Define default end of line character (eolchar)
- Constant definition
- Notes
-
While other concerns transform the eolchar to a variable,
the Input concern knows that it is a constant.
- Segment Source
-
73: /* The character marking end of line. Default to \n. */
74: int eolchar = '\n';
75:
- Declare line data structure (line)
-
Structure declaration
- Notes
-
The first field optimization concern (1stFldOptz) adds the
keybeg and keylim members.
They are not part of the basic concern.
- Segment Source
-
76: /* Lines are held in core as counted strings. */
77: struct line
78: {
79: char *text; /* Text of the line. */
80: int length; /* Length not including final newline. */
81: char *keybeg; /* Start of first key. */
82: char *keylim; /* Limit of first key. */
83: };
84:
- Declare lines data structure (lines)
-
Structure declaration
- Segment Source
- 85: /* Arrays of lines. */
86: struct lines
87: {
88: struct line *lines; /* Dynamically allocated array of lines. */
89: int used; /* Number of slots used. */
90: int alloc; /* Number of slots allocated. */
91: int limit; /* Max number of slots to allocate. */
92: };
93:
- Declare buffer data structure (buffer)
-
Structure declaration
- Segment Source
- 94: /* Input buffers. */
95: struct buffer
96: {
97: char *buf; /* Dynamically allocated buffer. */
98: int used; /* Number of bytes used. */
99: int alloc; /* Number of bytes allocated. */
100: int left; /* Number of bytes left after line parsing. */
101: };
102:
- Define size of sort buffer (sortalloc)
-
Constant definition
- Segment Source
-
165: /* Initial buffer size for in core sorting. Will not grow unless a
166: line longer than this is seen. */
167: static int sortalloc = 512 * 1024;
168:
- Define size of merge buffer (mergealloc)
- Constant definition
- Segment Source
-
169: /* Initial buffer size for in core merge buffers. Bear in mind that
170: up to NMERGE * mergealloc bytes may be allocated for merge buffers. */
171: static int mergealloc = 16 * 1024;
172:
- Define default line length (linelength)
- Constant definition
- Segment Source
-
173: /* Guess of average line length. */
174: static int linelength = 30;
175:
- Define maximum number of lines (LINEALLOC)
- Constant definition
- Segment Source
-
176: /* Maximum number of elements for the array(s) of struct line's, in bytes. */
177: #define LINEALLOC (256 * 1024)
178:
- Define initbuf()
-
Function definition
-
Initialization for line oriented input buffers
- Segment Source
-
470: /* Initialize BUF, allocating ALLOC bytes initially. */
471:
472: static void
473: initbuf (struct buffer *buf, int alloc)
474: {
475: buf->alloc = alloc;
476: buf->buf = xmalloc (buf->alloc);
477: buf->used = buf->left = 0;
478: }
479:
- Define fillbuf()
-
Function definition
-
Read entire file into a virtual memory.
- Segment Source
-
480: /* Fill BUF reading from FP, moving buf->left bytes from the end
481: of buf->buf to the beginning first. If EOF is reached and the
482: file wasn't terminated by a newline, supply one. Return a count
483: of bytes buffered. */
484:
485: static int
486: fillbuf (struct buffer *buf, FILE *fp)
487: {
488: int cc;
489:
490: memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
491: buf->used = buf->left;
492:
493: while (!feof (fp) && (buf->used == 0 || !memchr (buf->buf, eolchar, buf->used)))
494: {
495: if (buf->used == buf->alloc)
496: {
497: buf->alloc *= 2;
498: buf->buf = xrealloc (buf->buf, buf->alloc);
499: }
500: cc = fread (buf->buf + buf->used, 1, buf->alloc - buf->used, fp);
501: if (ferror (fp))
502: {
503: error (0, errno, _("read error"));
504: cleanup ();
505: exit (SORT_FAILURE);
506: }
507: buf->used += cc;
508: }
509:
510: if (feof (fp) && buf->used && buf->buf[buf->used - 1] != eolchar)
511: {
512: if (buf->used == buf->alloc)
513: {
514: buf->alloc *= 2;
515: buf->buf = xrealloc (buf->buf, buf->alloc);
516: }
517: buf->buf[buf->used++] = eolchar;
518: }
519:
520: return buf->used;
521: }
522:
- Define initlines()
-
Function definition
- Initialize the lines{} data structure.
- Segment Source
-
523: /* Initialize LINES, allocating space for ALLOC lines initially.
524: LIMIT is the maximum possible number of lines to allocate space
525: for, ever. */
526:
527: static void
528: initlines (struct lines *lines, int alloc, int limit)
529: {
530: lines->alloc = alloc;
531: lines->lines = (struct line *) xmalloc (lines->alloc * sizeof (struct line));
532: lines->used = 0;
533: lines->limit = limit;
534: }
535:
- Define findlines()
-
Function definition
-
Extract the next chunk of lines in the input buffer.
- Notes
-
This also translates eolchar to the null character ('\0').
(This is unnecessary, but cheap, if the alternate eolchar
is being used.)
The code from lines 715-745 is strictly extension code.
It is not part of the core capability.
- Segment Source
-
685: /* Find the lines in BUF, storing pointers and lengths in LINES.
686: Also replace newlines in BUF with NULs. */
687:
688: static void
689: findlines (struct buffer *buf, struct lines *lines)
690: {
691: register char *beg = buf->buf, *lim = buf->buf + buf->used, *ptr;
692: struct keyfield *key = keyhead.next;
693:
694: lines->used = 0;
695:
696: while (beg < lim && (ptr = memchr (beg, eolchar, lim - beg))
697: && lines->used < lines->limit)
698: {
699: /* There are various places in the code that rely on a NUL
700: being at the end of in-core lines; NULs inside the lines
701: will not cause trouble, though. */
702: *ptr = '\0';
703:
704: if (lines->used == lines->alloc)
705: {
706: lines->alloc *= 2;
707: lines->lines = (struct line *)
708: xrealloc ((char *) lines->lines,
709: lines->alloc * sizeof (struct line));
710: }
711:
712: lines->lines[lines->used].text = beg;
713: lines->lines[lines->used].length = ptr - beg;
714:
715: /* Precompute the position of the first key for efficiency. */
716: if (key)
717: {
718: if (key->eword >= 0)
719: lines->lines[lines->used].keylim =
720: limfield (&lines->lines[lines->used], key);
721: else
722: lines->lines[lines->used].keylim = ptr;
723:
724: if (key->sword >= 0)
725: lines->lines[lines->used].keybeg =
726: begfield (&lines->lines[lines->used], key);
727: else
728: {
729: if (key->skipsblanks)
730: while (blanks[UCHAR (*beg)])
731: ++beg;
732: lines->lines[lines->used].keybeg = beg;
733: }
734: if (key->skipeblanks)
735: {
736: trim_trailing_blanks (lines->lines[lines->used].keybeg,
737: &lines->lines[lines->used].keylim);
738: }
739: }
740: else
741: {
742: lines->lines[lines->used].keybeg = 0;
743: lines->lines[lines->used].keylim = 0;
744: }
745:
746: ++lines->used;
747: beg = ptr + 1;
748: }
749:
750: buf->left = lim - beg;
751: }
752:
- Input in check mode
- Code insertion
- Notes
-
Check Mode input and Sort Mode input are quite similar.
The both read files one at a time.
Check Mode refills at the bottom of the loop.
Segment Element
Variable declaration
-
1213: struct buffer buf; /* Input buffer. */
1214: struct lines lines; /* Lines scanned from the buffer. */
1215: struct line temp; /* Copy of previous line. */
Segment Element
Code insertionInitialize data structures
- 1219: initbuf (&buf, mergealloc);
1220: initlines (&lines, mergealloc / linelength + 1,
1221: LINEALLOC / ((NMERGE + NMERGE) * sizeof (struct line)));
Segment Element
Code insertionRead in a chuck of data.
- 1225: cc = fillbuf (&buf, fp);
1226: if (cc == 0)
1227: goto finish;
1228:
1229: findlines (&buf, &lines);
1230:
Segment Element
Code insertionRefill with next chuck of data.
-
1265: cc = fillbuf (&buf, fp);
1266: if (cc == 0)
1267: break;
1268:
1269: findlines (&buf, &lines);
- Input in merge mode
- Notes
-
Merge Mode input is specialized to reading multiple files.
Merge Mode refills at the bottom of the loop.
Segment Element
Array declaration
-
1294: struct buffer buffer[NMERGE]; /* Input buffers for each file. */
1295: struct lines lines[NMERGE]; /* Line tables for each buffer. */
Segment Element
Code insertion
Initialize data structures and read first chuck of data.
-
1320: initbuf (&buffer[i], mergealloc);
1321: /* If a file is empty, eliminate it from future consideration. */
1322: while (i < nfps && !fillbuf (&buffer[i], fps[i]))
1323: {
1324: xfclose (fps[i]);
1325: --nfps;
1326: for (j = i; j < nfps; ++j)
1327: fps[j] = fps[j + 1];
1328: }
1329: if (i == nfps)
1330: free (buffer[i].buf);
1331: else
1332: {
1333: initlines (&lines[i], mergealloc / linelength + 1,
1334: LINEALLOC / ((NMERGE + NMERGE) * sizeof (struct line)));
1335: findlines (&buffer[i], &lines[i]);
1336: cur[i] = 0;
1337: }
Segment Element
Code insertionRefill with next chuck of data.
-
1398: if (fillbuf (&buffer[ord[0]], fps[ord[0]]))
1399: {
1400: findlines (&buffer[ord[0]], &lines[ord[0]]);
1401: cur[ord[0]] = 0;
1402: }
- Input in sort mode
- Notes
-
Check Mode input and Sort Mode input are quite similar.
The both read files one at a time.
Sort Mode refills at the top of the loop.
Segment Element
Variable declaration
- 1558: struct buffer buf;
1559: struct lines lines;
Segment Element
Code insertionInitialize data structures
- 1567: initbuf (&buf, sortalloc);
1568: initlines (&lines, sortalloc / linelength + 1,
1569: LINEALLOC / sizeof (struct line));
Segment Element
Code insertionRead in a chuck of data.
-
1576: while (fillbuf (&buf, fp))
1577: {
1578: findlines (&buf, &lines);