There are three relatively common newline conventions:
\r\n (CR LF),
\n (LF), and
\r (CR), and a fourth one that can occur when an editor gets confused about the newline convention,
\n\r (LF CR). If an approach supports universal newlines, it supports all four simultaneously, even if mixed.
Reading files line by line with universal newline support is easy. The only problem is that interactive input from line-buffered sources looks like it is read one line late. To avoid that, one can read lines into a dynamic buffer up to, but not including the newline; and consume the newline when reading the next line. For example:
#include <sys/types.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>
/**
 * Read one line from a stream, accepting the newline conventions
 * "\n", "\r", "\r\n" and "\n\r", even when they are mixed in one stream.
 *
 * The line is stored, NUL-terminated and without its newline, in the
 * dynamically allocated buffer *dataptr; *sizeptr tracks the capacity of
 * that buffer and both are updated whenever the buffer is (re)allocated.
 * A zero *sizeptr (or a NULL *dataptr) means "no buffer yet". The caller
 * owns the buffer and releases it with free().
 *
 * The newline that ends a line is not consumed by the call that reads the
 * line; it is pushed back and skipped at the start of the next call. This
 * keeps interactive, line-buffered input from appearing one line late.
 * A consequence is that a newline found at the stream position of the very
 * first call is treated as such a pending separator, so an empty first
 * line is not reported.
 *
 * @param dataptr  Address of the buffer pointer (must not be NULL).
 * @param sizeptr  Address of the buffer capacity (must not be NULL).
 * @param in       Stream to read from (must not be NULL).
 * @return Number of chars in the line (newline excluded) with errno == 0,
 *         or -1 with errno == 0 at end of input, errno == EINVAL for NULL
 *         arguments, errno == ENOMEM if the buffer cannot grow, or a
 *         nonzero errno (EIO if the library reported none) on read error.
 */
ssize_t getline_universal(char **dataptr, size_t *sizeptr, FILE *in)
{
    char   *data = NULL;
    char   *grown;
    size_t  size = 0;
    size_t  used = 0;
    int     c;

    if (!dataptr || !sizeptr || !in) {
        errno = EINVAL;
        return -1;
    }

    /* Only trust the caller's buffer when both pointer and size are set. */
    if (*dataptr && *sizeptr) {
        data = *dataptr;
        size = *sizeptr;
    } else {
        *dataptr = NULL;
        *sizeptr = 0;
    }

    /* Ensure there are at least 2 chars available. realloc() rather than
       malloc(), so an existing 1-char buffer is resized instead of leaked. */
    if (size < 2) {
        grown = realloc(data, 2);
        if (!grown) {
            errno = ENOMEM;
            return -1;
        }
        data = grown;
        size = 2;
        *dataptr = data;
        *sizeptr = size;
    }

    /* Start clean, so a nonzero errno after a failed read is meaningful. */
    errno = 0;

    /* Consume the separator left in the stream by the previous call:
       "\n", "\n\r", "\r" or "\r\n". */
    c = fgetc(in);
    if (c == '\n') {
        c = fgetc(in);
        if (c == '\r')
            c = fgetc(in);
    } else if (c == '\r') {
        c = fgetc(in);
        if (c == '\n')
            c = fgetc(in);
    }

    /* No more data? Distinguish a read error from end of input. */
    if (c == EOF) {
        data[used] = '\0';
        if (ferror(in)) {
            if (!errno)
                errno = EIO;
        } else {
            errno = 0;
        }
        return -1;
    }

    while (c != '\n' && c != '\r' && c != EOF) {
        /* Keep room for this char plus the terminating NUL. */
        if (used + 1 >= size) {
            size_t want;

            if (used < 7)
                want = 8;
            else if (used < 1048576)
                want = (3 * used) / 2;          /* grow by half */
            else
                /* next 1 MiB boundary above used, plus 1 MiB */
                want = (used | 1048575) + 1048577;

            grown = realloc(data, want);
            if (!grown) {
                /* The old buffer is still owned by the caller via
                   *dataptr; leave it as a valid string. */
                data[used] = '\0';
                errno = ENOMEM;
                return -1;
            }
            data = grown;
            size = want;
            *dataptr = data;
            *sizeptr = size;
        }
        data[used++] = (char)c;
        c = fgetc(in);
    }

    /* Terminate line. We know used < size. */
    data[used] = '\0';

    if (c == EOF) {
        /* A read error in the middle of a line is a failure, not a line. */
        if (ferror(in)) {
            if (!errno)
                errno = EIO;
            return -1;
        }
    } else {
        /* Do not consume the newline separator. */
        ungetc(c, in);
    }

    /* Done. */
    errno = 0;
    return (ssize_t)used;
}
The above function works much like POSIX.1-2008 getline()
, except that it supports all four newline conventions (even mixed), and that it omits the newline from the line read. (That is, the newline is not included in either the return value or the dynamically allocated buffer. The newline is left in the stream, and consumed by the next getline_universal()
operation.)
Unlike standard functions, getline_universal()
always sets errno
: to zero if successful, and nonzero otherwise. If you don't like the behaviour, feel free to change that.
As a usage example:
/*
 * Example driver: read standard input line by line with
 * getline_universal() and echo each line with its number and length.
 *
 * Returns EXIT_SUCCESS when input ends normally, or EXIT_FAILURE (after
 * printing a message to standard error) when the last read left errno
 * nonzero.
 */
int main(void)
{
    unsigned long  linenum = 0u;
    char          *line_buf = NULL;
    size_t         line_max = 0;
    ssize_t        line_len;
    int            saved_errno;
    int            status = EXIT_SUCCESS;

    while (1) {
        line_len = getline_universal(&line_buf, &line_max, stdin);
        if (line_len < 0)
            break;
        linenum++;
        printf("%lu: \"%s\" (%zd chars)\n", linenum, line_buf, line_len);
        fflush(stdout);
    }

    /* Capture errno at once; later library calls are allowed to change it. */
    saved_errno = errno;
    if (saved_errno) {
        fprintf(stderr, "Error reading from standard input: %s.\n", strerror(saved_errno));
        status = EXIT_FAILURE;
    }

    /* Not necessary before exiting, but here's how to
       safely discard the line buffer: */
    free(line_buf);
    line_buf = NULL;
    line_max = 0;
    line_len = 0;

    return status;
}
Note that because free(NULL)
is safe, you can discard the buffer (using free(line_buf); line_buf = NULL; line_max = 0;
) before any call to getline_universal(&line_buf, &line_max, stream)
.