| // Copyright (c) 2012 The WebM project authors. All Rights Reserved. |
| // |
| // Use of this source code is governed by a BSD-style license |
| // that can be found in the LICENSE file in the root of the source |
| // tree. An additional intellectual property rights grant can be found |
| // in the file PATENTS. All contributing project authors may |
| // be found in the AUTHORS file in the root of the source tree. |
| |
| #include "./webvttparser.h" // NOLINT |
| #include <climits> |
| |
| namespace libwebvtt { |
| |
// Character codes that the parser treats specially: the NUL sentinel used to
// terminate scans, the two whitespace characters, and the two line
// terminator characters.
//
// The enum is marked NOLINT because clang-format would otherwise collapse it
// onto a single line, which is much harder to read.
enum {
  kNUL = '\0',
  kSPACE = '\x20',
  kTAB = '\t',
  kLF = '\n',
  kCR = '\r'
};  // NOLINT
| |
| Reader::~Reader() {} |
| |
| LineReader::~LineReader() {} |
| |
| int LineReader::GetLine(std::string* line_ptr) { |
| if (line_ptr == NULL) |
| return -1; |
| |
| std::string& ln = *line_ptr; |
| ln.clear(); |
| |
| // Consume characters from the stream, until we |
| // reach end-of-line (or end-of-stream). |
| |
| // The WebVTT spec states that lines may be |
| // terminated in any of these three ways: |
| // LF |
| // CR |
| // CR LF |
| |
| // We interrogate each character as we read it from the stream. |
| // If we detect an end-of-line character, we consume the full |
| // end-of-line indication, and we're done; otherwise, accumulate |
| // the character and repeat. |
| |
| for (;;) { |
| char c; |
| const int e = GetChar(&c); |
| |
| if (e < 0) // error |
| return e; |
| |
| if (e > 0) // EOF |
| return (ln.empty()) ? 1 : 0; |
| |
| // We have a character, so we must first determine |
| // whether we have reached end-of-line. |
| |
| if (c == kLF) |
| return 0; // handle the easy end-of-line case immediately |
| |
| if (c == kCR) |
| break; // handle the hard end-of-line case outside of loop |
| |
| if (c == '\xFE' || c == '\xFF') // not UTF-8 |
| return -1; |
| |
| // To defend against pathological or malicious streams, we |
| // cap the line length at some arbitrarily-large value: |
| enum { kMaxLineLength = 10000 }; // arbitrary |
| |
| if (ln.length() >= kMaxLineLength) |
| return -1; |
| |
| // We don't have an end-of-line character, so accumulate |
| // the character in our line buffer. |
| ln.push_back(c); |
| } |
| |
| // We detected a CR. We must interrogate the next character |
| // in the stream, to determine whether we have a LF (which |
| // would make it part of this same line). |
| |
| char c; |
| const int e = GetChar(&c); |
| |
| if (e < 0) // error |
| return e; |
| |
| if (e > 0) // EOF |
| return 0; |
| |
| // If next character in the stream is not a LF, return it |
| // to the stream (because it's part of the next line). |
| if (c != kLF) |
| UngetChar(c); |
| |
| return 0; |
| } |
| |
| Parser::Parser(Reader* r) : reader_(r), unget_(-1) {} |
| |
| Parser::~Parser() {} |
| |
| int Parser::Init() { |
| int e = ParseBOM(); |
| |
| if (e < 0) // error |
| return e; |
| |
| if (e > 0) // EOF |
| return -1; |
| |
| // Parse "WEBVTT". We read from the stream one character at-a-time, in |
| // order to defend against non-WebVTT streams (e.g. binary files) that don't |
| // happen to comprise lines of text demarcated with line terminators. |
| |
| const char kId[] = "WEBVTT"; |
| |
| for (const char* p = kId; *p; ++p) { |
| char c; |
| e = GetChar(&c); |
| |
| if (e < 0) // error |
| return e; |
| |
| if (e > 0) // EOF |
| return -1; |
| |
| if (c != *p) |
| return -1; |
| } |
| |
| std::string line; |
| |
| e = GetLine(&line); |
| |
| if (e < 0) // error |
| return e; |
| |
| if (e > 0) // EOF |
| return 0; // weird but valid |
| |
| if (!line.empty()) { |
| // Parse optional characters that follow "WEBVTT" |
| |
| const char c = line[0]; |
| |
| if (c != kSPACE && c != kTAB) |
| return -1; |
| } |
| |
| // The WebVTT spec requires that the "WEBVTT" line |
| // be followed by an empty line (to separate it from |
| // first cue). |
| |
| e = GetLine(&line); |
| |
| if (e < 0) // error |
| return e; |
| |
| if (e > 0) // EOF |
| return 0; // weird but we allow it |
| |
| if (!line.empty()) |
| return -1; |
| |
| return 0; // success |
| } |
| |
| int Parser::Parse(Cue* cue) { |
| if (cue == NULL) |
| return -1; |
| |
| // Parse first non-blank line |
| |
| std::string line; |
| int e; |
| |
| for (;;) { |
| e = GetLine(&line); |
| |
| if (e) // EOF is OK here |
| return e; |
| |
| if (!line.empty()) |
| break; |
| } |
| |
| // A WebVTT cue comprises an optional cue identifier line followed |
| // by a (non-optional) timings line. You determine whether you have |
| // a timings line by scanning for the arrow token, the lexeme of which |
| // may not appear in the cue identifier line. |
| |
| const char kArrow[] = "-->"; |
| std::string::size_type arrow_pos = line.find(kArrow); |
| |
| if (arrow_pos != std::string::npos) { |
| // We found a timings line, which implies that we don't have a cue |
| // identifier. |
| |
| cue->identifier.clear(); |
| } else { |
| // We did not find a timings line, so we assume that we have a cue |
| // identifier line, and then try again to find the cue timings on |
| // the next line. |
| |
| cue->identifier.swap(line); |
| |
| e = GetLine(&line); |
| |
| if (e < 0) // error |
| return e; |
| |
| if (e > 0) // EOF |
| return -1; |
| |
| arrow_pos = line.find(kArrow); |
| |
| if (arrow_pos == std::string::npos) // not a timings line |
| return -1; |
| } |
| |
| e = ParseTimingsLine(&line, arrow_pos, &cue->start_time, &cue->stop_time, |
| &cue->settings); |
| |
| if (e) // error |
| return e; |
| |
| // The cue payload comprises all the non-empty |
| // lines that follow the timings line. |
| |
| Cue::payload_t& p = cue->payload; |
| p.clear(); |
| |
| for (;;) { |
| e = GetLine(&line); |
| |
| if (e < 0) // error |
| return e; |
| |
| if (line.empty()) |
| break; |
| |
| p.push_back(line); |
| } |
| |
| if (p.empty()) |
| return -1; |
| |
| return 0; // success |
| } |
| |
| int Parser::GetChar(char* c) { |
| if (unget_ >= 0) { |
| *c = static_cast<char>(unget_); |
| unget_ = -1; |
| return 0; |
| } |
| |
| return reader_->GetChar(c); |
| } |
| |
| void Parser::UngetChar(char c) { unget_ = static_cast<unsigned char>(c); } |
| |
| int Parser::ParseBOM() { |
| // Explanation of UTF-8 BOM: |
| // http://en.wikipedia.org/wiki/Byte_order_mark |
| |
| static const char BOM[] = "\xEF\xBB\xBF"; // UTF-8 BOM |
| |
| for (int i = 0; i < 3; ++i) { |
| char c; |
| int e = GetChar(&c); |
| |
| if (e < 0) // error |
| return e; |
| |
| if (e > 0) // EOF |
| return 1; |
| |
| if (c != BOM[i]) { |
| if (i == 0) { // we don't have a BOM |
| UngetChar(c); |
| return 0; // success |
| } |
| |
| // We started a BOM, so we must finish the BOM. |
| return -1; // error |
| } |
| } |
| |
| return 0; // success |
| } |
| |
| int Parser::ParseTimingsLine(std::string* line_ptr, |
| std::string::size_type arrow_pos, Time* start_time, |
| Time* stop_time, Cue::settings_t* settings) { |
| if (line_ptr == NULL) |
| return -1; |
| |
| std::string& line = *line_ptr; |
| |
| if (arrow_pos == std::string::npos || arrow_pos >= line.length()) |
| return -1; |
| |
| // Place a NUL character at the start of the arrow token, in |
| // order to demarcate the start time from remainder of line. |
| line[arrow_pos] = kNUL; |
| std::string::size_type idx = 0; |
| |
| int e = ParseTime(line, &idx, start_time); |
| if (e) // error |
| return e; |
| |
| // Detect any junk that follows the start time, |
| // but precedes the arrow symbol. |
| |
| while (char c = line[idx]) { |
| if (c != kSPACE && c != kTAB) |
| return -1; |
| ++idx; |
| } |
| |
| // Place a NUL character at the end of the line, |
| // so the scanner has a place to stop, and begin |
| // the scan just beyond the arrow token. |
| |
| line.push_back(kNUL); |
| idx = arrow_pos + 3; |
| |
| e = ParseTime(line, &idx, stop_time); |
| if (e) // error |
| return e; |
| |
| e = ParseSettings(line, idx, settings); |
| if (e) // error |
| return e; |
| |
| return 0; // success |
| } |
| |
| int Parser::ParseTime(const std::string& line, std::string::size_type* idx_ptr, |
| Time* time) { |
| if (idx_ptr == NULL) |
| return -1; |
| |
| std::string::size_type& idx = *idx_ptr; |
| |
| if (idx == std::string::npos || idx >= line.length()) |
| return -1; |
| |
| if (time == NULL) |
| return -1; |
| |
| // Consume any whitespace that precedes the timestamp. |
| |
| while (char c = line[idx]) { |
| if (c != kSPACE && c != kTAB) |
| break; |
| ++idx; |
| } |
| |
| // WebVTT timestamp syntax comes in three flavors: |
| // SS[.sss] |
| // MM:SS[.sss] |
| // HH:MM:SS[.sss] |
| |
| // Parse a generic number value. We don't know which component |
| // of the time we have yet, until we do more parsing. |
| |
| int val = ParseNumber(line, &idx); |
| |
| if (val < 0) // error |
| return val; |
| |
| Time& t = *time; |
| |
| // The presence of a colon character indicates that we have |
| // an [HH:]MM:SS style syntax. |
| |
| if (line[idx] == ':') { |
| // We have either HH:MM:SS or MM:SS |
| |
| // The value we just parsed is either the hours or minutes. |
| // It must be followed by another number value (that is |
| // either minutes or seconds). |
| |
| const int first_val = val; |
| |
| ++idx; // consume colon |
| |
| // Parse second value |
| |
| val = ParseNumber(line, &idx); |
| |
| if (val < 0) |
| return val; |
| |
| if (val >= 60) // either MM or SS |
| return -1; |
| |
| if (line[idx] == ':') { |
| // We have HH:MM:SS |
| |
| t.hours = first_val; |
| t.minutes = val; // vetted above |
| |
| ++idx; // consume MM:SS colon |
| |
| // We have parsed the hours and minutes. |
| // We must now parse the seconds. |
| |
| val = ParseNumber(line, &idx); |
| |
| if (val < 0) |
| return val; |
| |
| if (val >= 60) // SS part of HH:MM:SS |
| return -1; |
| |
| t.seconds = val; |
| } else { |
| // We have MM:SS |
| |
| // The implication here is that the hour value was omitted |
| // from the timestamp (because it was 0). |
| |
| if (first_val >= 60) // minutes |
| return -1; |
| |
| t.hours = 0; |
| t.minutes = first_val; |
| t.seconds = val; // vetted above |
| } |
| } else { |
| // We have SS (only) |
| |
| // The time is expressed as total number of seconds, |
| // so the seconds value has no upper bound. |
| |
| t.seconds = val; |
| |
| // Convert SS to HH:MM:SS |
| |
| t.minutes = t.seconds / 60; |
| t.seconds -= t.minutes * 60; |
| |
| t.hours = t.minutes / 60; |
| t.minutes -= t.hours * 60; |
| } |
| |
| // We have parsed the hours, minutes, and seconds. |
| // We must now parse the milliseconds. |
| |
| char c = line[idx]; |
| |
| // TODO(matthewjheaney): one option here is to slightly relax the |
| // syntax rules for WebVTT timestamps, to permit the comma character |
| // to also be used as the seconds/milliseconds separator. This |
| // would handle streams that use localization conventions for |
| // countries in Western Europe. For now we obey the rules specified |
| // in the WebVTT spec (allow "full stop" only). |
| |
| const bool have_milliseconds = (c == '.'); |
| |
| if (!have_milliseconds) { |
| t.milliseconds = 0; |
| } else { |
| ++idx; // consume FULL STOP |
| |
| val = ParseNumber(line, &idx); |
| |
| if (val < 0) |
| return val; |
| |
| if (val >= 1000) |
| return -1; |
| |
| if (val < 10) |
| t.milliseconds = val * 100; |
| else if (val < 100) |
| t.milliseconds = val * 10; |
| else |
| t.milliseconds = val; |
| } |
| |
| // We have parsed the time proper. We must check for any |
| // junk that immediately follows the time specifier. |
| |
| c = line[idx]; |
| |
| if (c != kNUL && c != kSPACE && c != kTAB) |
| return -1; |
| |
| return 0; // success |
| } |
| |
| int Parser::ParseSettings(const std::string& line, std::string::size_type idx, |
| Cue::settings_t* settings) { |
| settings->clear(); |
| |
| if (idx == std::string::npos || idx >= line.length()) |
| return -1; |
| |
| for (;;) { |
| // We must parse a line comprising a sequence of 0 or more |
| // NAME:VALUE pairs, separated by whitespace. The line iself is |
| // terminated with a NUL char (indicating end-of-line). |
| |
| for (;;) { |
| const char c = line[idx]; |
| |
| if (c == kNUL) // end-of-line |
| return 0; // success |
| |
| if (c != kSPACE && c != kTAB) |
| break; |
| |
| ++idx; // consume whitespace |
| } |
| |
| // We have consumed the whitespace, and have not yet reached |
| // end-of-line, so there is something on the line for us to parse. |
| |
| settings->push_back(Setting()); |
| Setting& s = settings->back(); |
| |
| // Parse the NAME part of the settings pair. |
| |
| for (;;) { |
| const char c = line[idx]; |
| |
| if (c == ':') // we have reached end of NAME part |
| break; |
| |
| if (c == kNUL || c == kSPACE || c == kTAB) |
| return -1; |
| |
| s.name.push_back(c); |
| |
| ++idx; |
| } |
| |
| if (s.name.empty()) |
| return -1; |
| |
| ++idx; // consume colon |
| |
| // Parse the VALUE part of the settings pair. |
| |
| for (;;) { |
| const char c = line[idx]; |
| |
| if (c == kNUL || c == kSPACE || c == kTAB) |
| break; |
| |
| if (c == ':') // suspicious when part of VALUE |
| return -1; // TODO(matthewjheaney): verify this behavior |
| |
| s.value.push_back(c); |
| |
| ++idx; |
| } |
| |
| if (s.value.empty()) |
| return -1; |
| } |
| } |
| |
| int Parser::ParseNumber(const std::string& line, |
| std::string::size_type* idx_ptr) { |
| if (idx_ptr == NULL) |
| return -1; |
| |
| std::string::size_type& idx = *idx_ptr; |
| |
| if (idx == std::string::npos || idx >= line.length()) |
| return -1; |
| |
| if (!isdigit(line[idx])) |
| return -1; |
| |
| int result = 0; |
| |
| while (isdigit(line[idx])) { |
| const char c = line[idx]; |
| const int i = c - '0'; |
| |
| if (result > INT_MAX / 10) |
| return -1; |
| |
| result *= 10; |
| |
| if (result > INT_MAX - i) |
| return -1; |
| |
| result += i; |
| |
| ++idx; |
| } |
| |
| return result; |
| } |
| |
| bool Time::operator==(const Time& rhs) const { |
| if (hours != rhs.hours) |
| return false; |
| |
| if (minutes != rhs.minutes) |
| return false; |
| |
| if (seconds != rhs.seconds) |
| return false; |
| |
| return (milliseconds == rhs.milliseconds); |
| } |
| |
| bool Time::operator<(const Time& rhs) const { |
| if (hours < rhs.hours) |
| return true; |
| |
| if (hours > rhs.hours) |
| return false; |
| |
| if (minutes < rhs.minutes) |
| return true; |
| |
| if (minutes > rhs.minutes) |
| return false; |
| |
| if (seconds < rhs.seconds) |
| return true; |
| |
| if (seconds > rhs.seconds) |
| return false; |
| |
| return (milliseconds < rhs.milliseconds); |
| } |
| |
| bool Time::operator>(const Time& rhs) const { return rhs.operator<(*this); } |
| |
| bool Time::operator<=(const Time& rhs) const { return !this->operator>(rhs); } |
| |
| bool Time::operator>=(const Time& rhs) const { return !this->operator<(rhs); } |
| |
| presentation_t Time::presentation() const { |
| const presentation_t h = 1000LL * 3600LL * presentation_t(hours); |
| const presentation_t m = 1000LL * 60LL * presentation_t(minutes); |
| const presentation_t s = 1000LL * presentation_t(seconds); |
| const presentation_t result = h + m + s + milliseconds; |
| return result; |
| } |
| |
| Time& Time::presentation(presentation_t d) { |
| if (d < 0) { // error |
| hours = 0; |
| minutes = 0; |
| seconds = 0; |
| milliseconds = 0; |
| |
| return *this; |
| } |
| |
| seconds = static_cast<int>(d / 1000); |
| milliseconds = static_cast<int>(d - 1000 * seconds); |
| |
| minutes = seconds / 60; |
| seconds -= 60 * minutes; |
| |
| hours = minutes / 60; |
| minutes -= 60 * hours; |
| |
| return *this; |
| } |
| |
| Time& Time::operator+=(presentation_t rhs) { |
| const presentation_t d = this->presentation(); |
| const presentation_t dd = d + rhs; |
| this->presentation(dd); |
| return *this; |
| } |
| |
| Time Time::operator+(presentation_t d) const { |
| Time t(*this); |
| t += d; |
| return t; |
| } |
| |
| Time& Time::operator-=(presentation_t d) { return this->operator+=(-d); } |
| |
| presentation_t Time::operator-(const Time& t) const { |
| const presentation_t rhs = t.presentation(); |
| const presentation_t lhs = this->presentation(); |
| const presentation_t result = lhs - rhs; |
| return result; |
| } |
| |
| } // namespace libwebvtt |