Skip to content

Commit ef2fa3b

Browse files
committed
optimize lexer for faster lexing
1 parent 97b131c commit ef2fa3b

File tree

2 files changed

+118
-43
lines changed

2 files changed

+118
-43
lines changed

compiler/rustc_lexer/src/cursor.rs

Lines changed: 66 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ pub struct Cursor<'a> {
2121
pub(crate) const EOF_CHAR: char = '\0';
2222

2323
impl<'a> Cursor<'a> {
24+
#[inline]
2425
pub fn new(input: &'a str, frontmatter_allowed: FrontmatterAllowed) -> Cursor<'a> {
2526
Cursor {
2627
len_remaining: input.len(),
@@ -31,6 +32,7 @@ impl<'a> Cursor<'a> {
3132
}
3233
}
3334

35+
/// Returns the part of the input that has not been consumed yet, as a string slice.
#[inline]
3436
pub fn as_str(&self) -> &'a str {
3537
self.chars.as_str()
3638
}
@@ -53,12 +55,14 @@ impl<'a> Cursor<'a> {
5355
/// If requested position doesn't exist, `EOF_CHAR` is returned.
5456
/// However, getting `EOF_CHAR` doesn't always mean actual end of file,
5557
/// it should be checked with `is_eof` method.
58+
#[inline]
5659
pub fn first(&self) -> char {
5760
// `.next()` optimizes better than `.nth(0)`
5861
self.chars.clone().next().unwrap_or(EOF_CHAR)
5962
}
6063

6164
/// Peeks the second symbol from the input stream without consuming it.
65+
#[inline]
6266
pub(crate) fn second(&self) -> char {
6367
// `.next()` optimizes better than `.nth(1)`
6468
let mut iter = self.chars.clone();
@@ -67,6 +71,7 @@ impl<'a> Cursor<'a> {
6771
}
6872

6973
/// Peeks the third symbol from the input stream without consuming it.
74+
#[inline]
7075
pub fn third(&self) -> char {
7176
// `.next()` optimizes better than `.nth(2)`
7277
let mut iter = self.chars.clone();
@@ -76,21 +81,25 @@ impl<'a> Cursor<'a> {
7681
}
7782

7883
/// Checks if there is nothing more to consume.
///
/// This tests the remaining input directly, so it stays accurate even when the
/// input itself contains a literal `'\0'` (the value of `EOF_CHAR`).
84+
#[inline]
7985
pub(crate) fn is_eof(&self) -> bool {
8086
self.chars.as_str().is_empty()
8187
}
8288

8389
/// Returns the number of bytes (not characters) consumed since the position
/// was last reset, computed as the recorded baseline length minus the length
/// of the remaining input.
///
/// NOTE(review): the `as u32` cast would truncate if more than `u32::MAX`
/// bytes were consumed; presumably the input size is bounded elsewhere — confirm.
90+
#[inline]
8491
pub(crate) fn pos_within_token(&self) -> u32 {
8592
(self.len_remaining - self.chars.as_str().len()) as u32
8693
}
8794

8895
/// Resets the number of bytes consumed to 0.
///
/// Records the current remaining length as the new baseline, so that
/// `pos_within_token` returns 0 until more input is consumed.
96+
#[inline]
8997
pub(crate) fn reset_pos_within_token(&mut self) {
9098
self.len_remaining = self.chars.as_str().len();
9199
}
92100

93101
/// Moves to the next character.
102+
#[inline]
94103
pub(crate) fn bump(&mut self) -> Option<char> {
95104
let c = self.chars.next()?;
96105

@@ -102,24 +111,76 @@ impl<'a> Cursor<'a> {
102111
Some(c)
103112
}
104113

114+
/// Bumps the cursor if the next character is `expected`.
///
/// Returns `true` if the character was consumed. On a mismatch or at end of
/// input the cursor is left untouched and `false` is returned.
///
/// NOTE(review): this advances `self.chars` directly rather than going through
/// `bump`. Part of `bump`'s body is not visible in this diff; if it records the
/// consumed character for `prev()`, that bookkeeping is skipped here — confirm.
#[inline]
115+
pub(crate) fn bump_if(&mut self, expected: char) -> bool {
116+
// Advance a copy of the iterator and commit it only on a match.
let mut chars = self.chars.clone();
117+
if chars.next() == Some(expected) {
118+
self.chars = chars;
119+
true
120+
} else {
121+
false
122+
}
123+
}
124+
125+
/// Bumps the cursor if the next character is either of the two expected characters.
///
/// Returns `true` if a character was consumed. On a mismatch or at end of
/// input the cursor is left untouched and `false` is returned.
126+
#[inline]
127+
pub(crate) fn bump_if2(&mut self, expected1: char, expected2: char) -> bool {
128+
// Advance a copy of the iterator and commit it only on a match.
let mut chars = self.chars.clone();
129+
if let Some(c) = chars.next()
130+
&& (c == expected1 || c == expected2)
131+
{
132+
self.chars = chars;
133+
return true;
134+
}
135+
false
136+
}
137+
105138
/// Moves to a substring by a number of bytes.
///
/// With the added `get(n..)` form: if `n` is past the end of the remaining
/// input or does not fall on a UTF-8 character boundary, the cursor is moved
/// to the end of input instead of panicking.
///
/// NOTE(review): the removed `[n..]` slicing panicked in those cases, which
/// surfaced caller bugs immediately; the new form silently skips the rest of
/// the input — confirm this trade-off is intended.
139+
#[inline]
106140
pub(crate) fn bump_bytes(&mut self, n: usize) {
107-
self.chars = self.as_str()[n..].chars();
141+
self.chars = self.as_str().get(n..).unwrap_or("").chars();
108142
}
109143

110144
/// Eats symbols while predicate returns true or until the end of file is reached.
145+
#[inline]
111146
pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
112147
// An optimized version of this was tried for e.g. line comments, but
113148
// LLVM can inline all of this and compile it down to fast iteration over bytes.
114149
// `predicate` is evaluated before the EOF check, so at end of input it is
// still called once with `EOF_CHAR` (the value `first()` falls back to).
while predicate(self.first()) && !self.is_eof() {
115150
self.bump();
116151
}
117152
}
153+
/// Eats characters until the given byte is found; the byte itself is not consumed.
154+
/// Returns true if the byte was found, false if end of file was reached.
///
/// NOTE(review): `byte` is presumably always ASCII (the callers visible in
/// this diff pass `b'\n'` and `b'"'`); for a non-ASCII byte the match index may
/// not be a character boundary — confirm for any other callers.
155+
#[inline]
156+
pub(crate) fn eat_until(&mut self, byte: u8) -> bool {
157+
match memchr::memchr(byte, self.as_str().as_bytes()) {
158+
Some(index) => {
159+
// Stop right before the matched byte.
self.bump_bytes(index);
160+
true
161+
}
162+
None => {
163+
// No match: consume everything that is left.
self.chars = "".chars();
164+
false
165+
}
166+
}
167+
}
118168

119-
pub(crate) fn eat_until(&mut self, byte: u8) {
120-
self.chars = match memchr::memchr(byte, self.as_str().as_bytes()) {
121-
Some(index) => self.as_str()[index..].chars(),
122-
None => "".chars(),
169+
/// Eats characters until either of the two given bytes is found, then consumes past it.
170+
/// Returns the found byte if any, or None if end of file was reached.
///
/// Both bytes should be ASCII: the cursor is advanced to `index + 1`, which
/// is only guaranteed to be a character boundary when the matched byte is a
/// complete one-byte character. The callers visible in this diff pass
/// `b'/'`, `b'*'`, `b'"'` and `b'\\'`.
171+
#[inline]
172+
pub(crate) fn eat_past2(&mut self, byte1: u8, byte2: u8) -> Option<u8> {
173+
let bytes = self.as_str().as_bytes();
174+
match memchr::memchr2(byte1, byte2, bytes) {
175+
Some(index) => {
176+
// Remember which of the two bytes matched before moving past it.
let found = bytes[index];
177+
self.bump_bytes(index + 1);
178+
Some(found)
179+
}
180+
None => {
181+
// No match: consume everything that is left.
self.chars = "".chars();
182+
None
183+
}
123184
}
124185
}
125186
}

compiler/rustc_lexer/src/lib.rs

Lines changed: 52 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -563,11 +563,30 @@ impl Cursor<'_> {
563563
self.eat_while(|ch| ch != '\n' && is_horizontal_whitespace(ch));
564564
let invalid_infostring = self.first() != '\n';
565565

566-
let mut found = false;
567-
let nl_fence_pattern = format!("\n{:-<1$}", "", length_opening as usize);
568-
if let Some(closing) = self.as_str().find(&nl_fence_pattern) {
566+
/// Finds the first `'\n'` in `s` that is immediately followed by at least
/// `dash_count` `'-'` bytes.
///
/// Returns the byte offset just past those `dash_count` dashes (so the
/// newline and the dashes are included), or `None` if there is no such
/// position in `s`.
#[inline]
567+
fn find_closing_fence(s: &str, dash_count: usize) -> Option<usize> {
568+
let bytes = s.as_bytes();
569+
// Offset in `bytes` from which the next newline search starts.
let mut i = 0;
570+
while i < bytes.len() {
571+
if let Some(newline_pos) = memchr::memchr(b'\n', &bytes[i..]) {
572+
// `newline_pos` is relative to `bytes[i..]`; step to the byte right after the newline.
i += newline_pos + 1;
573+
let start = i;
574+
// Only inspect the candidate if `dash_count` bytes are still available.
if start + dash_count <= bytes.len() {
575+
let slice = &bytes[start..start + dash_count];
576+
if slice.iter().all(|&b| b == b'-') {
577+
return Some(start + dash_count);
578+
}
579+
}
580+
} else {
581+
// No further newline, so no further candidate.
break;
582+
}
583+
}
584+
None
585+
}
586+
587+
if let Some(closing) = find_closing_fence(self.as_str(), length_opening as usize) {
569588
// candidate found
570-
self.bump_bytes(closing + nl_fence_pattern.len());
589+
self.bump_bytes(closing);
571590
// in case like
572591
// ---cargo
573592
// --- blahblah
@@ -576,10 +595,7 @@ impl Cursor<'_> {
576595
// ----
577596
// combine those stuff into this frontmatter token such that it gets detected later.
578597
self.eat_until(b'\n');
579-
found = true;
580-
}
581-
582-
if !found {
598+
} else {
583599
// recovery strategy: a closing statement might have preceding whitespace/newline
584600
// but not have enough dashes to properly close. In this case, we eat until there,
585601
// and report a mismatch in the parser.
@@ -656,23 +672,25 @@ impl Cursor<'_> {
656672
};
657673

658674
let mut depth = 1usize;
659-
while let Some(c) = self.bump() {
675+
while let Some(c) = self.eat_past2(b'/', b'*') {
660676
match c {
661-
'/' if self.first() == '*' => {
662-
self.bump();
663-
depth += 1;
677+
b'/' => {
678+
if self.bump_if('*') {
679+
depth += 1;
680+
}
664681
}
665-
'*' if self.first() == '/' => {
666-
self.bump();
667-
depth -= 1;
668-
if depth == 0 {
669-
// This block comment is closed, so for a construction like "/* */ */"
670-
// there will be a successfully parsed block comment "/* */"
671-
// and " */" will be processed separately.
672-
break;
682+
b'*' => {
683+
if self.bump_if('/') {
684+
depth -= 1;
685+
if depth == 0 {
686+
// This block comment is closed, so for a construction like "/* */ */"
687+
// there will be a successfully parsed block comment "/* */"
688+
// and " */" will be processed separately.
689+
break;
690+
}
673691
}
674692
}
675-
_ => (),
693+
_ => unreachable!(),
676694
}
677695
}
678696

@@ -935,19 +953,21 @@ impl Cursor<'_> {
935953
/// if string is terminated.
936954
fn double_quoted_string(&mut self) -> bool {
937955
debug_assert!(self.prev() == '"');
938-
while let Some(c) = self.bump() {
956+
while let Some(c) = self.eat_past2(b'"', b'\\') {
939957
match c {
940-
'"' => {
958+
b'"' => {
941959
return true;
942960
}
943-
'\\' if self.first() == '\\' || self.first() == '"' => {
944-
// Bump again to skip escaped character.
945-
self.bump();
961+
b'\\' => {
962+
let first = self.first();
963+
if first == '\\' || first == '"' {
964+
// Bump to skip escaped character.
965+
self.bump();
966+
}
946967
}
947-
_ => (),
968+
_ => unreachable!(),
948969
}
949970
}
950-
// End of file reached.
951971
false
952972
}
953973

@@ -963,9 +983,8 @@ impl Cursor<'_> {
963983
debug_assert!(self.prev() != '#');
964984

965985
let mut n_start_hashes: u32 = 0;
966-
while self.first() == '#' {
986+
while self.bump_if('#') {
967987
n_start_hashes += 1;
968-
self.bump();
969988
}
970989

971990
if self.first() != '"' {
@@ -1025,9 +1044,8 @@ impl Cursor<'_> {
10251044

10261045
// Count opening '#' symbols.
10271046
let mut eaten = 0;
1028-
while self.first() == '#' {
1047+
while self.bump_if('#') {
10291048
eaten += 1;
1030-
self.bump();
10311049
}
10321050
let n_start_hashes = eaten;
10331051

@@ -1043,9 +1061,7 @@ impl Cursor<'_> {
10431061
// Skip the string contents and on each '#' character met, check if this is
10441062
// a raw string termination.
10451063
loop {
1046-
self.eat_until(b'"');
1047-
1048-
if self.is_eof() {
1064+
if !self.eat_until(b'"') {
10491065
return Err(RawStrError::NoTerminator {
10501066
expected: n_start_hashes,
10511067
found: max_hashes,
@@ -1117,9 +1133,7 @@ impl Cursor<'_> {
11171133
/// and returns false otherwise.
11181134
fn eat_float_exponent(&mut self) -> bool {
11191135
debug_assert!(self.prev() == 'e' || self.prev() == 'E');
1120-
if self.first() == '-' || self.first() == '+' {
1121-
self.bump();
1122-
}
1136+
self.bump_if2('-', '+');
11231137
self.eat_decimal_digits()
11241138
}
11251139

0 commit comments

Comments
 (0)