//! Lexical analysis for `.clif` files.

use crate::error::Location;
use cranelift_codegen::ir::types;
use cranelift_codegen::ir::{Block, Value};
use std::str::CharIndices;
use std::u16;

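/// A token returned from the `Lexer`.
///
/// Some variants borrow slices of the original source text, so a `Token` cannot outlive the
/// source it was lexed from.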
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum Token<'a> {
    Comment(&'a str),
    LPar,                   // '('
    RPar,                   // ')'
    LBrace,                 // '{'
    RBrace,                 // '}'
    LBracket,               // '['
    RBracket,               // ']'
    Minus,                  // '-'
    Plus,                   // '+'
    Multiply,               // '*'
    Comma,                  // ','
    Dot,                    // '.'
    Colon,                  // ':'
    Equal,                  // '='
    Bang,                   // '!'
    At,                     // '@'
    Arrow,                  // '->'
    Float(&'a str),         // Floating point immediate
    Integer(&'a str),       // Integer immediate
    Type(types::Type),      // i32, f32x4, ...
    DynamicType(u32),       // dt5
    Value(Value),           // v12
    Block(Block),           // block3
    Cold,                   // cold (flag on a block)
    StackSlot(u32),         // ss3
    DynamicStackSlot(u32),  // dss4
    GlobalValue(u32),       // gv3
    MemoryType(u32),        // mt0
    Constant(u32),          // const2
    FuncRef(u32),           // fn2
    SigRef(u32),            // sig2
    UserRef(u32),           // u345
    UserNameRef(u32),       // userextname345
    ExceptionTableRef(u32), // extable2
    ExceptionTag(u32),      // tag2
    TryCallRet(u32),        // ret2
    TryCallExn(u32),        // exn2
    Name(&'a str),          // %9arbitrary_alphanum, %x3, %0, %function ...
    String(&'a str),        // "arbitrary quoted string with no escape" ...
    HexSequence(&'a str),   // #89AF
    Identifier(&'a str),    // Unrecognized identifier (opcode, enumerator, ...)
    SourceLoc(&'a str),     // @00c7
}

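/// A `Token` with an associated location.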
#[derive(Debug, PartialEq, Eq)]
pub struct LocatedToken<'a> {
    pub token: Token<'a>,
    pub location: Location,
}

/// Wrap up a `Token` with the given location.
fn token(token: Token, loc: Location) -> Result<LocatedToken, LocatedError> {
    Ok(LocatedToken {
        token,
        location: loc,
    })
}

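/// An error from the lexical analysis.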
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexError {
    InvalidChar,
}

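/// A `LexError` with an associated location.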
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LocatedError {
    pub error: LexError,
    pub location: Location,
}

/// Wrap up a `LexError` with the given location.
fn error<'a>(error: LexError, loc: Location) -> Result<LocatedToken<'a>, LocatedError> {
    Err(LocatedError {
        error,
        location: loc,
    })
}

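/// Returns the number of decimal digits at the end of `s`.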
fn trailing_digits(s: &str) -> usize {
    s.as_bytes()
        .iter()
        .rev()
        .take_while(|&&b| b'0' <= b && b <= b'9')
        .count()
}

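/// Split an entity name such as `block27` into its prefix and trailing number: `("block", 27)`.
///
/// Returns `None` if the name does not end in a number, or if the digits carry a redundant
/// leading zero (e.g. `inst01`).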
pub fn split_entity_name(name: &str) -> Option<(&str, u32)> {
    let (head, tail) = name.split_at(name.len() - trailing_digits(name));
    if tail.len() > 1 && tail.starts_with('0') {
        None
    } else {
        tail.parse().ok().map(|n| (head, n))
    }
}

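/// Lexical analyzer for `.clif` source text.
///
/// Yields `LocatedToken`s (or `LocatedError`s) through repeated calls to `next()`, roughly as in
/// this illustrative sketch:
///
/// ```ignore
/// let mut lex = Lexer::new("v0 = iconst.i32 7");
/// while let Some(result) = lex.next() {
///     let located = result.expect("lexical error");
///     println!("{}: {:?}", located.location.line_number, located.token);
/// }
/// ```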
pub struct Lexer<'a> {
    // The complete source text being lexed.
    source: &'a str,

    // Iterator over the characters of `source`.
    chars: CharIndices<'a>,

    // The next character to be processed, or `None` when the end has been reached.
    lookahead: Option<char>,

    // Byte position in `source` of the lookahead character.
    pos: usize,

    // Current line number, starting at 1.
    line_number: usize,
}

impl<'a> Lexer<'a> {
    /// Create a new `Lexer` over the source text `s`.
    pub fn new(s: &'a str) -> Self {
        let mut lex = Self {
            source: s,
            chars: s.char_indices(),
            lookahead: None,
            pos: 0,
            line_number: 1,
        };
        // Prime the lookahead; this first advance does not affect the line count.
        lex.next_ch();
        lex
    }

    /// Advance to the next character and return it.
    ///
    /// Updates `pos` and `lookahead`, incrementing the line count when advancing past a newline.
    fn next_ch(&mut self) -> Option<char> {
        if self.lookahead == Some('\n') {
            self.line_number += 1;
        }
        match self.chars.next() {
            Some((idx, ch)) => {
                self.pos = idx;
                self.lookahead = Some(ch);
            }
            None => {
                self.pos = self.source.len();
                self.lookahead = None;
            }
        }
        self.lookahead
    }

    /// The location of the current lookahead character.
    fn loc(&self) -> Location {
        Location {
            line_number: self.line_number,
        }
    }

    /// Does the remaining source text, starting at the current position, begin with `prefix`?
    fn looking_at(&self, prefix: &str) -> bool {
        self.source[self.pos..].starts_with(prefix)
    }

    /// Could the characters at the current position start a number?
    fn looking_at_numeric(&self) -> bool {
        if let Some(c) = self.lookahead {
            match c {
                '0'..='9' => return true,
                '-' => return true,
                '+' => return true,
                '.' => return true,
                _ => {}
            }
            if self.looking_at("NaN") || self.looking_at("Inf") || self.looking_at("sNaN") {
                return true;
            }
        }
        false
    }

    /// Consume one character and emit `tok` for it.
    fn scan_char(&mut self, tok: Token<'a>) -> Result<LocatedToken<'a>, LocatedError> {
        assert_ne!(self.lookahead, None);
        let loc = self.loc();
        self.next_ch();
        token(tok, loc)
    }

    /// Consume `count` characters and emit `tok` for them.
    fn scan_chars(
        &mut self,
        count: usize,
        tok: Token<'a>,
    ) -> Result<LocatedToken<'a>, LocatedError> {
        let loc = self.loc();
        for _ in 0..count {
            assert_ne!(self.lookahead, None);
            self.next_ch();
        }
        token(tok, loc)
    }

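    /// Get the rest of the current line, not including the newline, as a slice of the source.
    ///
    /// The next token returned by `next()` will come from the following lines.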
    pub fn rest_of_line(&mut self) -> &'a str {
        let begin = self.pos;
        loop {
            match self.next_ch() {
                None | Some('\n') => return &self.source[begin..self.pos],
                _ => {}
            }
        }
    }

    /// Scan a comment extending to the end of the current line.
    fn scan_comment(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
        let loc = self.loc();
        let text = self.rest_of_line();
        token(Token::Comment(text), loc)
    }

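    /// Scan a number token, or a lone `-`/`+` when nothing numeric follows the sign.
    ///
    /// The scanner is deliberately permissive: it accepts decimal and hexadecimal digits, `_`
    /// separators, a `.` (which marks the token as a float), and the special forms `NaN`,
    /// `sNaN` (optionally with a `:`-separated payload), and `Inf`. Full validation of the
    /// number is left to the parser.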
    fn scan_number(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
        let begin = self.pos;
        let loc = self.loc();
        let mut is_float = false;

        // A leading sign is only part of a number if something numeric follows it.
        match self.lookahead {
            Some('-') => {
                self.next_ch();
                if !self.looking_at_numeric() {
                    // A lone '-' is the minus token.
                    return token(Token::Minus, loc);
                }
            }
            Some('+') => {
                self.next_ch();
                if !self.looking_at_numeric() {
                    // A lone '+' is the plus token.
                    return token(Token::Plus, loc);
                }
            }
            _ => {}
        }

        if self.looking_at("NaN:") || self.looking_at("sNaN:") {
            // A NaN with an explicit payload; skip to the ':' and let the loop below consume
            // the payload digits.
            while self.next_ch() != Some(':') {}
            is_float = true;
        } else if self.looking_at("NaN") || self.looking_at("Inf") {
            is_float = true;
        }

        // Accept a liberal set of characters here; the parser is responsible for fully
        // validating the number.
        loop {
            match self.next_ch() {
                Some('-') | Some('_') => {}
                Some('.') => is_float = true,
                Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
                _ => break,
            }
        }
        let text = &self.source[begin..self.pos];
        if is_float {
            token(Token::Float(text), loc)
        } else {
            token(Token::Integer(text), loc)
        }
    }

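    /// Scan a word, which is an identifier-like sequence starting with `_` or an alphabetic
    /// character.
    ///
    /// Words that name a numbered entity (`v12`, `block3`, `ss7`, ...) or a value type (`i32`,
    /// `f64x2`, ...) get dedicated token variants; `cold` becomes `Token::Cold`, and anything
    /// else is returned as `Token::Identifier`.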
    fn scan_word(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
        let begin = self.pos;
        let loc = self.loc();

        assert!(self.lookahead == Some('_') || self.lookahead.unwrap().is_ascii_alphabetic());
        loop {
            match self.next_ch() {
                Some('_') | Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
                _ => break,
            }
        }
        let text = &self.source[begin..self.pos];

        token(
            split_entity_name(text)
                .and_then(|(prefix, number)| {
                    Self::numbered_entity(prefix, number)
                        .or_else(|| Self::value_type(text, prefix, number))
                })
                .unwrap_or_else(|| match text {
                    "cold" => Token::Cold,
                    _ => Token::Identifier(text),
                }),
            loc,
        )
    }

    /// Recognize a numbered entity reference such as `v12` or `block3` from its prefix and
    /// number, if the prefix is one we know.
    fn numbered_entity(prefix: &str, number: u32) -> Option<Token<'a>> {
        match prefix {
            "v" => Value::with_number(number).map(Token::Value),
            "block" => Block::with_number(number).map(Token::Block),
            "ss" => Some(Token::StackSlot(number)),
            "dss" => Some(Token::DynamicStackSlot(number)),
            "dt" => Some(Token::DynamicType(number)),
            "gv" => Some(Token::GlobalValue(number)),
            "mt" => Some(Token::MemoryType(number)),
            "const" => Some(Token::Constant(number)),
            "fn" => Some(Token::FuncRef(number)),
            "sig" => Some(Token::SigRef(number)),
            "u" => Some(Token::UserRef(number)),
            "userextname" => Some(Token::UserNameRef(number)),
            "extable" => Some(Token::ExceptionTableRef(number)),
            "tag" => Some(Token::ExceptionTag(number)),
            "ret" => Some(Token::TryCallRet(number)),
            "exn" => Some(Token::TryCallExn(number)),
            _ => None,
        }
    }

    /// Recognize a scalar or vector value type such as `i32` or `f64x2`.
    ///
    /// For vectors, `prefix` is the scalar part (ending in `x`) and `number` is the lane count.
    fn value_type(text: &str, prefix: &str, number: u32) -> Option<Token<'a>> {
        let is_vector = prefix.ends_with('x');
        let scalar = if is_vector {
            &prefix[0..prefix.len() - 1]
        } else {
            text
        };
        let base_type = match scalar {
            "i8" => types::I8,
            "i16" => types::I16,
            "i32" => types::I32,
            "i64" => types::I64,
            "i128" => types::I128,
            "f16" => types::F16,
            "f32" => types::F32,
            "f64" => types::F64,
            "f128" => types::F128,
            _ => return None,
        };
        if is_vector {
            // Lane counts beyond `u16::MAX` are never valid.
            if number <= u32::from(u16::MAX) {
                base_type.by(number).map(Token::Type)
            } else {
                None
            }
        } else {
            Some(Token::Type(base_type))
        }
    }

    /// Scan a name token starting with `%`; the returned slice excludes the sigil.
    fn scan_name(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
        let loc = self.loc();
        let begin = self.pos + 1;

        assert_eq!(self.lookahead, Some('%'));

        loop {
            match self.next_ch() {
                Some('_') | Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
                _ => break,
            }
        }

        let end = self.pos;
        token(Token::Name(&self.source[begin..end]), loc)
    }

    /// Scan a double-quoted string; the returned slice excludes the quotes. There is no escape
    /// syntax, so a string cannot contain a `"` character. An unterminated string is an error.
    fn scan_string(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
        let loc = self.loc();
        let begin = self.pos + 1;

        assert_eq!(self.lookahead, Some('"'));

        while let Some(c) = self.next_ch() {
            if c == '"' {
                break;
            }
        }

        let end = self.pos;
        if self.lookahead != Some('"') {
            return error(LexError::InvalidChar, self.loc());
        }
        self.next_ch();
        token(Token::String(&self.source[begin..end]), loc)
    }

    /// Scan a `#`-prefixed sequence of hexadecimal digits; the returned slice excludes the `#`.
    fn scan_hex_sequence(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
        let loc = self.loc();
        let begin = self.pos + 1;

        assert_eq!(self.lookahead, Some('#'));

        while let Some(c) = self.next_ch() {
            if !char::is_digit(c, 16) {
                break;
            }
        }

        let end = self.pos;
        token(Token::HexSequence(&self.source[begin..end]), loc)
    }

    /// Is the current lookahead character the start of a source location (a hex digit)?
    fn looking_at_srcloc(&self) -> bool {
        match self.lookahead {
            Some(c) => char::is_digit(c, 16),
            _ => false,
        }
    }

    /// Scan a source location token such as `@00c7`; `pos` and `loc` refer to the `@` that has
    /// already been consumed.
    fn scan_srcloc(&mut self, pos: usize, loc: Location) -> Result<LocatedToken<'a>, LocatedError> {
        let begin = pos + 1;
        while let Some(c) = self.next_ch() {
            if !char::is_digit(c, 16) {
                break;
            }
        }

        let end = self.pos;
        token(Token::SourceLoc(&self.source[begin..end]), loc)
    }

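    /// Get the next token, or `None` when the source is exhausted.
    ///
    /// Whitespace (spaces and the `'\x09'..='\x0d'` control characters) is skipped; any other
    /// unrecognized character yields a `LexError::InvalidChar`.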
    pub fn next(&mut self) -> Option<Result<LocatedToken<'a>, LocatedError>> {
        loop {
            let loc = self.loc();
            return match self.lookahead {
                None => None,
                Some(';') => Some(self.scan_comment()),
                Some('(') => Some(self.scan_char(Token::LPar)),
                Some(')') => Some(self.scan_char(Token::RPar)),
                Some('{') => Some(self.scan_char(Token::LBrace)),
                Some('}') => Some(self.scan_char(Token::RBrace)),
                Some('[') => Some(self.scan_char(Token::LBracket)),
                Some(']') => Some(self.scan_char(Token::RBracket)),
                Some(',') => Some(self.scan_char(Token::Comma)),
                Some('.') => Some(self.scan_char(Token::Dot)),
                Some(':') => Some(self.scan_char(Token::Colon)),
                Some('=') => Some(self.scan_char(Token::Equal)),
                Some('!') => Some(self.scan_char(Token::Bang)),
                Some('+') => Some(self.scan_number()),
                Some('*') => Some(self.scan_char(Token::Multiply)),
                Some('-') => {
                    if self.looking_at("->") {
                        Some(self.scan_chars(2, Token::Arrow))
                    } else {
                        Some(self.scan_number())
                    }
                }
                Some('0'..='9') => Some(self.scan_number()),
                Some('a'..='z') | Some('A'..='Z') => {
                    if self.looking_at("NaN") || self.looking_at("Inf") {
                        Some(self.scan_number())
                    } else {
                        Some(self.scan_word())
                    }
                }
                Some('%') => Some(self.scan_name()),
                Some('"') => Some(self.scan_string()),
                Some('#') => Some(self.scan_hex_sequence()),
                Some('@') => {
                    let pos = self.pos;
                    let loc = self.loc();
                    self.next_ch();
                    if self.looking_at_srcloc() {
                        Some(self.scan_srcloc(pos, loc))
                    } else {
                        Some(token(Token::At, loc))
                    }
                }
                // Skip whitespace and try again.
                Some(' ') | Some('\x09'..='\x0d') => {
                    self.next_ch();
                    continue;
                }
                _ => {
                    // Skip the invalid character so we don't get stuck on it.
                    self.next_ch();
                    Some(error(LexError::InvalidChar, loc))
                }
            };
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn digits() {
        assert_eq!(trailing_digits(""), 0);
        assert_eq!(trailing_digits("x"), 0);
        assert_eq!(trailing_digits("0x"), 0);
        assert_eq!(trailing_digits("x1"), 1);
        assert_eq!(trailing_digits("1x1"), 1);
        assert_eq!(trailing_digits("1x01"), 2);
    }

    #[test]
    fn entity_name() {
        assert_eq!(split_entity_name(""), None);
        assert_eq!(split_entity_name("x"), None);
        assert_eq!(split_entity_name("x+"), None);
        assert_eq!(split_entity_name("x+1"), Some(("x+", 1)));
        assert_eq!(split_entity_name("x-1"), Some(("x-", 1)));
        assert_eq!(split_entity_name("1"), Some(("", 1)));
        assert_eq!(split_entity_name("x1"), Some(("x", 1)));
        assert_eq!(split_entity_name("xy0"), Some(("xy", 0)));
        // A redundant leading zero is rejected.
        assert_eq!(split_entity_name("inst01"), None);
    }

    fn token<'a>(token: Token<'a>, line: usize) -> Option<Result<LocatedToken<'a>, LocatedError>> {
        Some(super::token(token, Location { line_number: line }))
    }

    fn error<'a>(error: LexError, line: usize) -> Option<Result<LocatedToken<'a>, LocatedError>> {
        Some(super::error(error, Location { line_number: line }))
    }

    #[test]
    fn make_lexer() {
        let mut l1 = Lexer::new("");
        let mut l2 = Lexer::new(" ");
        let mut l3 = Lexer::new("\n ");

        assert_eq!(l1.next(), None);
        assert_eq!(l2.next(), None);
        assert_eq!(l3.next(), None);
    }

    #[test]
    fn lex_comment() {
        let mut lex = Lexer::new("; hello");
        assert_eq!(lex.next(), token(Token::Comment("; hello"), 1));
        assert_eq!(lex.next(), None);

        lex = Lexer::new("\n ;hello\n;foo");
        assert_eq!(lex.next(), token(Token::Comment(";hello"), 2));
        assert_eq!(lex.next(), token(Token::Comment(";foo"), 3));
        assert_eq!(lex.next(), None);

        // Scan a comment after an invalid char.
        let mut lex = Lexer::new("$; hello");
        assert_eq!(lex.next(), error(LexError::InvalidChar, 1));
        assert_eq!(lex.next(), token(Token::Comment("; hello"), 1));
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_chars() {
        let mut lex = Lexer::new("(); hello\n = :{, }.");
        assert_eq!(lex.next(), token(Token::LPar, 1));
        assert_eq!(lex.next(), token(Token::RPar, 1));
        assert_eq!(lex.next(), token(Token::Comment("; hello"), 1));
        assert_eq!(lex.next(), token(Token::Equal, 2));
        assert_eq!(lex.next(), token(Token::Colon, 2));
        assert_eq!(lex.next(), token(Token::LBrace, 2));
        assert_eq!(lex.next(), token(Token::Comma, 2));
        assert_eq!(lex.next(), token(Token::RBrace, 2));
        assert_eq!(lex.next(), token(Token::Dot, 2));
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_numbers() {
        let mut lex = Lexer::new(" 0 2_000 -1,0xf -0x0 0.0 0x0.4p-34 NaN +5");
        assert_eq!(lex.next(), token(Token::Integer("0"), 1));
        assert_eq!(lex.next(), token(Token::Integer("2_000"), 1));
        assert_eq!(lex.next(), token(Token::Integer("-1"), 1));
        assert_eq!(lex.next(), token(Token::Comma, 1));
        assert_eq!(lex.next(), token(Token::Integer("0xf"), 1));
        assert_eq!(lex.next(), token(Token::Integer("-0x0"), 1));
        assert_eq!(lex.next(), token(Token::Float("0.0"), 1));
        assert_eq!(lex.next(), token(Token::Float("0x0.4p-34"), 1));
        assert_eq!(lex.next(), token(Token::Float("NaN"), 1));
        assert_eq!(lex.next(), token(Token::Integer("+5"), 1));
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_identifiers() {
        let mut lex = Lexer::new(
            "v0 v00 vx01 block1234567890 block5234567890 v1x vx1 vxvx4 \
             function0 function i8 i32x4 f32x5 f16 f128",
        );
        assert_eq!(
            lex.next(),
            token(Token::Value(Value::with_number(0).unwrap()), 1)
        );
        assert_eq!(lex.next(), token(Token::Identifier("v00"), 1));
        assert_eq!(lex.next(), token(Token::Identifier("vx01"), 1));
        assert_eq!(
            lex.next(),
            token(Token::Block(Block::with_number(1234567890).unwrap()), 1)
        );
        assert_eq!(lex.next(), token(Token::Identifier("block5234567890"), 1));
        assert_eq!(lex.next(), token(Token::Identifier("v1x"), 1));
        assert_eq!(lex.next(), token(Token::Identifier("vx1"), 1));
        assert_eq!(lex.next(), token(Token::Identifier("vxvx4"), 1));
        assert_eq!(lex.next(), token(Token::Identifier("function0"), 1));
        assert_eq!(lex.next(), token(Token::Identifier("function"), 1));
        assert_eq!(lex.next(), token(Token::Type(types::I8), 1));
        assert_eq!(lex.next(), token(Token::Type(types::I32X4), 1));
        assert_eq!(lex.next(), token(Token::Identifier("f32x5"), 1));
        assert_eq!(lex.next(), token(Token::Type(types::F16), 1));
        assert_eq!(lex.next(), token(Token::Type(types::F128), 1));
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_hex_sequences() {
        let mut lex = Lexer::new("#0 #DEADbeef123 #789");

        assert_eq!(lex.next(), token(Token::HexSequence("0"), 1));
        assert_eq!(lex.next(), token(Token::HexSequence("DEADbeef123"), 1));
        assert_eq!(lex.next(), token(Token::HexSequence("789"), 1));
    }

    #[test]
    fn lex_names() {
        let mut lex = Lexer::new("%0 %x3 %function %123_abc %ss0 %v3 %block11 %const42 %_");

        assert_eq!(lex.next(), token(Token::Name("0"), 1));
        assert_eq!(lex.next(), token(Token::Name("x3"), 1));
        assert_eq!(lex.next(), token(Token::Name("function"), 1));
        assert_eq!(lex.next(), token(Token::Name("123_abc"), 1));
        assert_eq!(lex.next(), token(Token::Name("ss0"), 1));
        assert_eq!(lex.next(), token(Token::Name("v3"), 1));
        assert_eq!(lex.next(), token(Token::Name("block11"), 1));
        assert_eq!(lex.next(), token(Token::Name("const42"), 1));
        assert_eq!(lex.next(), token(Token::Name("_"), 1));
    }

    #[test]
    fn lex_strings() {
        let mut lex = Lexer::new(
            r#""" "0" "x3""function" "123 abc" "\" "start
                    and end on
                    different lines" "#,
        );

        assert_eq!(lex.next(), token(Token::String(""), 1));
        assert_eq!(lex.next(), token(Token::String("0"), 1));
        assert_eq!(lex.next(), token(Token::String("x3"), 1));
        assert_eq!(lex.next(), token(Token::String("function"), 1));
        assert_eq!(lex.next(), token(Token::String("123 abc"), 1));
        assert_eq!(lex.next(), token(Token::String(r#"\"#), 1));
        assert_eq!(
            lex.next(),
            token(
                Token::String(
                    r#"start
                    and end on
                    different lines"#
                ),
                1
            )
        );
    }

    #[test]
    fn lex_userrefs() {
        let mut lex = Lexer::new("u0 u1 u234567890 u9:8765");

        assert_eq!(lex.next(), token(Token::UserRef(0), 1));
        assert_eq!(lex.next(), token(Token::UserRef(1), 1));
        assert_eq!(lex.next(), token(Token::UserRef(234567890), 1));
        assert_eq!(lex.next(), token(Token::UserRef(9), 1));
        assert_eq!(lex.next(), token(Token::Colon, 1));
        assert_eq!(lex.next(), token(Token::Integer("8765"), 1));
        assert_eq!(lex.next(), None);
    }
}