1use crate::error::Location;
4use cranelift_codegen::ir::types;
5use cranelift_codegen::ir::{Block, Value};
6use std::str::CharIndices;
7use std::u16;
8
9#[derive(Debug, PartialEq, Eq, Clone, Copy)]
14pub enum Token<'a> {
15 Comment(&'a str),
16 LPar, RPar, LBrace, RBrace, LBracket, RBracket, LAngle, RAngle, Minus, Plus, Multiply, Comma, Dot, Colon, Equal, Bang, At, Arrow, Float(&'a str), Integer(&'a str), Type(types::Type), DynamicType(u32), Value(Value), Block(Block), Cold, StackSlot(u32), DynamicStackSlot(u32), GlobalValue(u32), MemoryType(u32), Constant(u32), FuncRef(u32), SigRef(u32), UserRef(u32), UserNameRef(u32), ExceptionTableRef(u32), ExceptionTag(u32), AliasRegion(u32), TryCallRet(u32), TryCallExn(u32), Name(&'a str), String(&'a str), HexSequence(&'a str), Identifier(&'a str), SourceLoc(&'a str), }
61
/// A [`Token`] paired with the source location it was found at.
#[derive(Debug, PartialEq, Eq)]
pub struct LocatedToken<'a> {
    /// The token that was scanned.
    pub token: Token<'a>,
    /// Location the lexer recorded when it started scanning this token.
    pub location: Location,
}
68
69fn token(token: Token, loc: Location) -> Result<LocatedToken, LocatedError> {
71 Ok(LocatedToken {
72 token,
73 location: loc,
74 })
75}
76
/// The kinds of error the lexer can report.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexError {
    /// Reported for a character that cannot start any token, and for a string
    /// literal that reaches the end of input without a closing `"`.
    InvalidChar,
}
82
/// A [`LexError`] paired with the source location it was detected at.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LocatedError {
    /// What went wrong.
    pub error: LexError,
    /// Location the lexer associated with the error.
    pub location: Location,
}
89
90fn error<'a>(error: LexError, loc: Location) -> Result<LocatedToken<'a>, LocatedError> {
92 Err(LocatedError {
93 error,
94 location: loc,
95 })
96}
97
/// Count the ASCII decimal digits (`0`-`9`) at the end of `s`.
///
/// The scan works on bytes, so the result is also the byte length of the
/// numeric suffix. Non-ASCII digits are not counted.
fn trailing_digits(s: &str) -> usize {
    s.bytes()
        .rev()
        .take_while(|b| b.is_ascii_digit())
        .count()
}
107
108pub fn split_entity_name(name: &str) -> Option<(&str, u32)> {
111 let (head, tail) = name.split_at(name.len() - trailing_digits(name));
112 if tail.len() > 1 && tail.starts_with('0') {
113 None
114 } else {
115 tail.parse().ok().map(|n| (head, n))
116 }
117}
118
/// Lexer state for splitting a source string into tokens.
///
/// Tokens are pulled one at a time with `next`.
pub struct Lexer<'a> {
    /// The complete text being lexed; token text is returned as slices of it.
    source: &'a str,

    /// Iterator over the characters of `source` together with their byte offsets.
    chars: CharIndices<'a>,

    /// The character currently being examined, or `None` once the input is exhausted.
    lookahead: Option<char>,

    /// Byte offset of `lookahead` in `source`; equals `source.len()` at end of input.
    pos: usize,

    /// Line number of `lookahead`, counted from 1.
    line_number: usize,
}
141
142impl<'a> Lexer<'a> {
143 pub fn new(s: &'a str) -> Self {
144 let mut lex = Self {
145 source: s,
146 chars: s.char_indices(),
147 lookahead: None,
148 pos: 0,
149 line_number: 1,
150 };
151 lex.next_ch();
153 lex
154 }
155
156 fn next_ch(&mut self) -> Option<char> {
160 if self.lookahead == Some('\n') {
161 self.line_number += 1;
162 }
163 match self.chars.next() {
164 Some((idx, ch)) => {
165 self.pos = idx;
166 self.lookahead = Some(ch);
167 }
168 None => {
169 self.pos = self.source.len();
170 self.lookahead = None;
171 }
172 }
173 self.lookahead
174 }
175
176 fn loc(&self) -> Location {
178 Location {
179 line_number: self.line_number,
180 }
181 }
182
183 fn looking_at(&self, prefix: &str) -> bool {
185 self.source[self.pos..].starts_with(prefix)
186 }
187
188 fn looking_at_numeric(&self) -> bool {
190 if let Some(c) = self.lookahead {
191 match c {
192 '0'..='9' => return true,
193 '-' => return true,
194 '+' => return true,
195 '.' => return true,
196 _ => {}
197 }
198 if self.looking_at("NaN") || self.looking_at("Inf") || self.looking_at("sNaN") {
199 return true;
200 }
201 }
202 false
203 }
204
205 fn scan_char(&mut self, tok: Token<'a>) -> Result<LocatedToken<'a>, LocatedError> {
207 assert_ne!(self.lookahead, None);
208 let loc = self.loc();
209 self.next_ch();
210 token(tok, loc)
211 }
212
213 fn scan_chars(
215 &mut self,
216 count: usize,
217 tok: Token<'a>,
218 ) -> Result<LocatedToken<'a>, LocatedError> {
219 let loc = self.loc();
220 for _ in 0..count {
221 assert_ne!(self.lookahead, None);
222 self.next_ch();
223 }
224 token(tok, loc)
225 }
226
227 pub fn rest_of_line(&mut self) -> &'a str {
230 let begin = self.pos;
231 loop {
232 match self.next_ch() {
233 None | Some('\n') => return &self.source[begin..self.pos],
234 _ => {}
235 }
236 }
237 }
238
239 fn scan_comment(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
241 let loc = self.loc();
242 let text = self.rest_of_line();
243 token(Token::Comment(text), loc)
244 }
245
246 fn scan_number(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
262 let begin = self.pos;
263 let loc = self.loc();
264 let mut is_float = false;
265
266 match self.lookahead {
268 Some('-') => {
269 self.next_ch();
270 if !self.looking_at_numeric() {
271 return token(Token::Minus, loc);
273 }
274 }
275 Some('+') => {
276 self.next_ch();
277 if !self.looking_at_numeric() {
278 return token(Token::Plus, loc);
280 }
281 }
282 _ => {}
283 }
284
285 if self.looking_at("NaN:") || self.looking_at("sNaN:") {
287 while self.next_ch() != Some(':') {}
290 is_float = true;
291 } else if self.looking_at("NaN") || self.looking_at("Inf") {
292 is_float = true;
294 }
295
296 loop {
298 match self.next_ch() {
299 Some('-') | Some('_') => {}
300 Some('.') => is_float = true,
301 Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
302 _ => break,
303 }
304 }
305 let text = &self.source[begin..self.pos];
306 if is_float {
307 token(Token::Float(text), loc)
308 } else {
309 token(Token::Integer(text), loc)
310 }
311 }
312
313 fn scan_word(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
316 let begin = self.pos;
317 let loc = self.loc();
318
319 assert!(self.lookahead == Some('_') || self.lookahead.unwrap().is_ascii_alphabetic());
320 loop {
321 match self.next_ch() {
322 Some('_') | Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
323 _ => break,
324 }
325 }
326 let text = &self.source[begin..self.pos];
327
328 token(
330 split_entity_name(text)
331 .and_then(|(prefix, number)| {
332 Self::numbered_entity(prefix, number)
333 .or_else(|| Self::value_type(text, prefix, number))
334 })
335 .unwrap_or_else(|| match text {
336 "cold" => Token::Cold,
337 _ => Token::Identifier(text),
338 }),
339 loc,
340 )
341 }
342
343 fn numbered_entity(prefix: &str, number: u32) -> Option<Token<'a>> {
346 match prefix {
347 "v" => Value::with_number(number).map(Token::Value),
348 "block" => Block::with_number(number).map(Token::Block),
349 "ss" => Some(Token::StackSlot(number)),
350 "dss" => Some(Token::DynamicStackSlot(number)),
351 "dt" => Some(Token::DynamicType(number)),
352 "gv" => Some(Token::GlobalValue(number)),
353 "mt" => Some(Token::MemoryType(number)),
354 "const" => Some(Token::Constant(number)),
355 "fn" => Some(Token::FuncRef(number)),
356 "sig" => Some(Token::SigRef(number)),
357 "u" => Some(Token::UserRef(number)),
358 "userextname" => Some(Token::UserNameRef(number)),
359 "extable" => Some(Token::ExceptionTableRef(number)),
360 "tag" => Some(Token::ExceptionTag(number)),
361 "region" => Some(Token::AliasRegion(number)),
362 "ret" => Some(Token::TryCallRet(number)),
363 "exn" => Some(Token::TryCallExn(number)),
364 _ => None,
365 }
366 }
367
368 fn value_type(text: &str, prefix: &str, number: u32) -> Option<Token<'a>> {
370 let is_vector = prefix.ends_with('x');
371 let scalar = if is_vector {
372 &prefix[0..prefix.len() - 1]
373 } else {
374 text
375 };
376 let base_type = match scalar {
377 "i8" => types::I8,
378 "i16" => types::I16,
379 "i32" => types::I32,
380 "i64" => types::I64,
381 "i128" => types::I128,
382 "f16" => types::F16,
383 "f32" => types::F32,
384 "f64" => types::F64,
385 "f128" => types::F128,
386 _ => return None,
387 };
388 if is_vector {
389 if number <= u32::from(u16::MAX) {
390 base_type.by(number).map(Token::Type)
391 } else {
392 None
393 }
394 } else {
395 Some(Token::Type(base_type))
396 }
397 }
398
399 fn scan_name(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
400 let loc = self.loc();
401 let begin = self.pos + 1;
402
403 assert_eq!(self.lookahead, Some('%'));
404
405 loop {
406 match self.next_ch() {
407 Some('_') | Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
408 _ => break,
409 }
410 }
411
412 let end = self.pos;
413 token(Token::Name(&self.source[begin..end]), loc)
414 }
415
416 fn scan_string(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
418 let loc = self.loc();
419 let begin = self.pos + 1;
420
421 assert_eq!(self.lookahead, Some('"'));
422
423 while let Some(c) = self.next_ch() {
424 if c == '"' {
425 break;
426 }
427 }
428
429 let end = self.pos;
430 if self.lookahead != Some('"') {
431 return error(LexError::InvalidChar, self.loc());
432 }
433 self.next_ch();
434 token(Token::String(&self.source[begin..end]), loc)
435 }
436
437 fn scan_hex_sequence(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
438 let loc = self.loc();
439 let begin = self.pos + 1;
440
441 assert_eq!(self.lookahead, Some('#'));
442
443 while let Some(c) = self.next_ch() {
444 if !char::is_digit(c, 16) {
445 break;
446 }
447 }
448
449 let end = self.pos;
450 token(Token::HexSequence(&self.source[begin..end]), loc)
451 }
452
453 fn looking_at_srcloc(&self) -> bool {
456 match self.lookahead {
457 Some(c) => char::is_digit(c, 16),
458 _ => false,
459 }
460 }
461
462 fn scan_srcloc(&mut self, pos: usize, loc: Location) -> Result<LocatedToken<'a>, LocatedError> {
463 let begin = pos + 1;
464 while let Some(c) = self.next_ch() {
465 if !char::is_digit(c, 16) {
466 break;
467 }
468 }
469
470 let end = self.pos;
471 token(Token::SourceLoc(&self.source[begin..end]), loc)
472 }
473
474 pub fn next(&mut self) -> Option<Result<LocatedToken<'a>, LocatedError>> {
478 loop {
479 let loc = self.loc();
480 return match self.lookahead {
481 None => None,
482 Some(';') => Some(self.scan_comment()),
483 Some('(') => Some(self.scan_char(Token::LPar)),
484 Some(')') => Some(self.scan_char(Token::RPar)),
485 Some('{') => Some(self.scan_char(Token::LBrace)),
486 Some('}') => Some(self.scan_char(Token::RBrace)),
487 Some('[') => Some(self.scan_char(Token::LBracket)),
488 Some(']') => Some(self.scan_char(Token::RBracket)),
489 Some('<') => Some(self.scan_char(Token::LAngle)),
490 Some('>') => Some(self.scan_char(Token::RAngle)),
491 Some(',') => Some(self.scan_char(Token::Comma)),
492 Some('.') => Some(self.scan_char(Token::Dot)),
493 Some(':') => Some(self.scan_char(Token::Colon)),
494 Some('=') => Some(self.scan_char(Token::Equal)),
495 Some('!') => Some(self.scan_char(Token::Bang)),
496 Some('+') => Some(self.scan_number()),
497 Some('*') => Some(self.scan_char(Token::Multiply)),
498 Some('-') => {
499 if self.looking_at("->") {
500 Some(self.scan_chars(2, Token::Arrow))
501 } else {
502 Some(self.scan_number())
503 }
504 }
505 Some('0'..='9') => Some(self.scan_number()),
506 Some('a'..='z') | Some('A'..='Z') => {
507 if self.looking_at("NaN") || self.looking_at("Inf") {
508 Some(self.scan_number())
509 } else {
510 Some(self.scan_word())
511 }
512 }
513 Some('%') => Some(self.scan_name()),
514 Some('"') => Some(self.scan_string()),
515 Some('#') => Some(self.scan_hex_sequence()),
516 Some('@') => {
517 let pos = self.pos;
518 let loc = self.loc();
519 self.next_ch();
520 if self.looking_at_srcloc() {
521 Some(self.scan_srcloc(pos, loc))
522 } else {
523 Some(token(Token::At, loc))
524 }
525 }
526 Some(' ') | Some('\x09'..='\x0d') => {
528 self.next_ch();
529 continue;
530 }
531 _ => {
532 self.next_ch();
534 Some(error(LexError::InvalidChar, loc))
535 }
536 };
537 }
538 }
539}
540
#[cfg(test)]
mod tests {
    use super::*;

    /// Expected lexer output: `token` located on `line`.
    fn token<'a>(token: Token<'a>, line: usize) -> Option<Result<LocatedToken<'a>, LocatedError>> {
        let location = Location { line_number: line };
        Some(super::token(token, location))
    }

    /// Expected lexer output: `error` located on `line`.
    fn error<'a>(error: LexError, line: usize) -> Option<Result<LocatedToken<'a>, LocatedError>> {
        let location = Location { line_number: line };
        Some(super::error(error, location))
    }

    /// Assert that the next items from `lex` are exactly `tokens`, each on `line`.
    fn expect_tokens<'a>(lex: &mut Lexer<'a>, line: usize, tokens: &[Token<'a>]) {
        for &tok in tokens {
            assert_eq!(lex.next(), token(tok, line));
        }
    }

    #[test]
    fn digits() {
        let cases = [
            ("", 0),
            ("x", 0),
            ("0x", 0),
            ("x1", 1),
            ("1x1", 1),
            ("1x01", 2),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(trailing_digits(input), expected);
        }
    }

    #[test]
    fn entity_name() {
        let cases: [(&str, Option<(&str, u32)>); 9] = [
            ("", None),
            ("x", None),
            ("x+", None),
            ("x+1", Some(("x+", 1))),
            ("x-1", Some(("x-", 1))),
            ("1", Some(("", 1))),
            ("x1", Some(("x", 1))),
            ("xy0", Some(("xy", 0))),
            ("inst01", None),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(split_entity_name(input), expected);
        }
    }

    #[test]
    fn make_lexer() {
        // Empty and whitespace-only inputs produce no tokens.
        for &source in ["", " ", "\n "].iter() {
            let mut lex = Lexer::new(source);
            assert_eq!(lex.next(), None);
        }
    }

    #[test]
    fn lex_comment() {
        let mut lex = Lexer::new("; hello");
        expect_tokens(&mut lex, 1, &[Token::Comment("; hello")]);
        assert_eq!(lex.next(), None);

        let mut lex = Lexer::new("\n ;hello\n;foo");
        expect_tokens(&mut lex, 2, &[Token::Comment(";hello")]);
        expect_tokens(&mut lex, 3, &[Token::Comment(";foo")]);
        assert_eq!(lex.next(), None);

        // The lexer reports the invalid character and then carries on.
        let mut lex = Lexer::new("$; hello");
        assert_eq!(lex.next(), error(LexError::InvalidChar, 1));
        expect_tokens(&mut lex, 1, &[Token::Comment("; hello")]);
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_chars() {
        let mut lex = Lexer::new("(); hello\n = :{, }.");
        expect_tokens(
            &mut lex,
            1,
            &[Token::LPar, Token::RPar, Token::Comment("; hello")],
        );
        expect_tokens(
            &mut lex,
            2,
            &[
                Token::Equal,
                Token::Colon,
                Token::LBrace,
                Token::Comma,
                Token::RBrace,
                Token::Dot,
            ],
        );
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_numbers() {
        let mut lex = Lexer::new(" 0 2_000 -1,0xf -0x0 0.0 0x0.4p-34 NaN +5");
        expect_tokens(
            &mut lex,
            1,
            &[
                Token::Integer("0"),
                Token::Integer("2_000"),
                Token::Integer("-1"),
                Token::Comma,
                Token::Integer("0xf"),
                Token::Integer("-0x0"),
                Token::Float("0.0"),
                Token::Float("0x0.4p-34"),
                Token::Float("NaN"),
                Token::Integer("+5"),
            ],
        );
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_identifiers() {
        let mut lex = Lexer::new(
            "v0 v00 vx01 block1234567890 block5234567890 v1x vx1 vxvx4 \
             function0 function i8 i32x4 f32x5 f16 f128",
        );
        expect_tokens(
            &mut lex,
            1,
            &[
                Token::Value(Value::with_number(0).unwrap()),
                Token::Identifier("v00"),
                Token::Identifier("vx01"),
                Token::Block(Block::with_number(1234567890).unwrap()),
                Token::Identifier("block5234567890"),
                Token::Identifier("v1x"),
                Token::Identifier("vx1"),
                Token::Identifier("vxvx4"),
                Token::Identifier("function0"),
                Token::Identifier("function"),
                Token::Type(types::I8),
                Token::Type(types::I32X4),
                Token::Identifier("f32x5"),
                Token::Type(types::F16),
                Token::Type(types::F128),
            ],
        );
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_hex_sequences() {
        let mut lex = Lexer::new("#0 #DEADbeef123 #789");
        expect_tokens(
            &mut lex,
            1,
            &[
                Token::HexSequence("0"),
                Token::HexSequence("DEADbeef123"),
                Token::HexSequence("789"),
            ],
        );
    }

    #[test]
    fn lex_names() {
        let mut lex = Lexer::new("%0 %x3 %function %123_abc %ss0 %v3 %block11 %const42 %_");
        expect_tokens(
            &mut lex,
            1,
            &[
                Token::Name("0"),
                Token::Name("x3"),
                Token::Name("function"),
                Token::Name("123_abc"),
                Token::Name("ss0"),
                Token::Name("v3"),
                Token::Name("block11"),
                Token::Name("const42"),
                Token::Name("_"),
            ],
        );
    }

    #[test]
    fn lex_strings() {
        let mut lex = Lexer::new(
            r#""" "0" "x3""function" "123 abc" "\" "start
                    and end on
                    different lines" "#,
        );
        // A string token is located on the line where its opening quote sits,
        // even when the string spans several lines.
        expect_tokens(
            &mut lex,
            1,
            &[
                Token::String(""),
                Token::String("0"),
                Token::String("x3"),
                Token::String("function"),
                Token::String("123 abc"),
                Token::String(r#"\"#),
                Token::String(
                    r#"start
                    and end on
                    different lines"#,
                ),
            ],
        );
    }

    #[test]
    fn lex_userrefs() {
        let mut lex = Lexer::new("u0 u1 u234567890 u9:8765");
        expect_tokens(
            &mut lex,
            1,
            &[
                Token::UserRef(0),
                Token::UserRef(1),
                Token::UserRef(234567890),
                Token::UserRef(9),
                Token::Colon,
                Token::Integer("8765"),
            ],
        );
        assert_eq!(lex.next(), None);
    }
}