1use crate::error::Location;
4use cranelift_codegen::ir::types;
5use cranelift_codegen::ir::{Block, Value};
6use std::str::CharIndices;
7use std::u16;
8
9#[derive(Debug, PartialEq, Eq, Clone, Copy)]
14pub enum Token<'a> {
15 Comment(&'a str),
16 LPar, RPar, LBrace, RBrace, LBracket, RBracket, Minus, Plus, Multiply, Comma, Dot, Colon, Equal, Bang, At, Arrow, Float(&'a str), Integer(&'a str), Type(types::Type), DynamicType(u32), Value(Value), Block(Block), Cold, StackSlot(u32), DynamicStackSlot(u32), GlobalValue(u32), MemoryType(u32), Constant(u32), FuncRef(u32), SigRef(u32), UserRef(u32), UserNameRef(u32), Name(&'a str), String(&'a str), HexSequence(&'a str), Identifier(&'a str), SourceLoc(&'a str), }
54
55#[derive(Debug, PartialEq, Eq)]
57pub struct LocatedToken<'a> {
58 pub token: Token<'a>,
59 pub location: Location,
60}
61
62fn token(token: Token, loc: Location) -> Result<LocatedToken, LocatedError> {
64 Ok(LocatedToken {
65 token,
66 location: loc,
67 })
68}
69
/// The kinds of error the lexer can report.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum LexError {
    /// The input could not be tokenized at this point: either a character that starts no
    /// token, or a string literal that is never closed.
    InvalidChar,
}
75
76#[derive(Debug, Clone, Copy, PartialEq, Eq)]
78pub struct LocatedError {
79 pub error: LexError,
80 pub location: Location,
81}
82
83fn error<'a>(error: LexError, loc: Location) -> Result<LocatedToken<'a>, LocatedError> {
85 Err(LocatedError {
86 error,
87 location: loc,
88 })
89}
90
/// Count the ASCII decimal digits at the very end of `s`.
///
/// Returns 0 when `s` is empty or does not end in a digit.
fn trailing_digits(s: &str) -> usize {
    // Walk the bytes backwards; ASCII digits are single bytes, so this never splits a
    // multi-byte character.
    s.bytes().rev().take_while(u8::is_ascii_digit).count()
}
100
101pub fn split_entity_name(name: &str) -> Option<(&str, u32)> {
104 let (head, tail) = name.split_at(name.len() - trailing_digits(name));
105 if tail.len() > 1 && tail.starts_with('0') {
106 None
107 } else {
108 tail.parse().ok().map(|n| (head, n))
109 }
110}
111
/// Lexical analysis state for one source string.
///
/// The lexer keeps a single character of lookahead together with the byte offset of that
/// character, and tracks the current line number for error and token locations.
pub struct Lexer<'a> {
    /// The complete text being lexed; token text is sliced out of it.
    source: &'a str,

    /// Iterator over the characters of `source` that have not been looked at yet.
    chars: CharIndices<'a>,

    /// The character currently under inspection, or `None` at end of input.
    lookahead: Option<char>,

    /// Byte offset of `lookahead` in `source`; equals `source.len()` at end of input.
    pos: usize,

    /// Line number of `lookahead`, counting from 1.
    line_number: usize,
}
134
135impl<'a> Lexer<'a> {
136 pub fn new(s: &'a str) -> Self {
137 let mut lex = Self {
138 source: s,
139 chars: s.char_indices(),
140 lookahead: None,
141 pos: 0,
142 line_number: 1,
143 };
144 lex.next_ch();
146 lex
147 }
148
149 fn next_ch(&mut self) -> Option<char> {
153 if self.lookahead == Some('\n') {
154 self.line_number += 1;
155 }
156 match self.chars.next() {
157 Some((idx, ch)) => {
158 self.pos = idx;
159 self.lookahead = Some(ch);
160 }
161 None => {
162 self.pos = self.source.len();
163 self.lookahead = None;
164 }
165 }
166 self.lookahead
167 }
168
169 fn loc(&self) -> Location {
171 Location {
172 line_number: self.line_number,
173 }
174 }
175
176 fn looking_at(&self, prefix: &str) -> bool {
178 self.source[self.pos..].starts_with(prefix)
179 }
180
181 fn looking_at_numeric(&self) -> bool {
183 if let Some(c) = self.lookahead {
184 match c {
185 '0'..='9' => return true,
186 '-' => return true,
187 '+' => return true,
188 '.' => return true,
189 _ => {}
190 }
191 if self.looking_at("NaN") || self.looking_at("Inf") || self.looking_at("sNaN") {
192 return true;
193 }
194 }
195 false
196 }
197
198 fn scan_char(&mut self, tok: Token<'a>) -> Result<LocatedToken<'a>, LocatedError> {
200 assert_ne!(self.lookahead, None);
201 let loc = self.loc();
202 self.next_ch();
203 token(tok, loc)
204 }
205
206 fn scan_chars(
208 &mut self,
209 count: usize,
210 tok: Token<'a>,
211 ) -> Result<LocatedToken<'a>, LocatedError> {
212 let loc = self.loc();
213 for _ in 0..count {
214 assert_ne!(self.lookahead, None);
215 self.next_ch();
216 }
217 token(tok, loc)
218 }
219
220 pub fn rest_of_line(&mut self) -> &'a str {
223 let begin = self.pos;
224 loop {
225 match self.next_ch() {
226 None | Some('\n') => return &self.source[begin..self.pos],
227 _ => {}
228 }
229 }
230 }
231
232 fn scan_comment(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
234 let loc = self.loc();
235 let text = self.rest_of_line();
236 token(Token::Comment(text), loc)
237 }
238
239 fn scan_number(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
255 let begin = self.pos;
256 let loc = self.loc();
257 let mut is_float = false;
258
259 match self.lookahead {
261 Some('-') => {
262 self.next_ch();
263 if !self.looking_at_numeric() {
264 return token(Token::Minus, loc);
266 }
267 }
268 Some('+') => {
269 self.next_ch();
270 if !self.looking_at_numeric() {
271 return token(Token::Plus, loc);
273 }
274 }
275 _ => {}
276 }
277
278 if self.looking_at("NaN:") || self.looking_at("sNaN:") {
280 while self.next_ch() != Some(':') {}
283 is_float = true;
284 } else if self.looking_at("NaN") || self.looking_at("Inf") {
285 is_float = true;
287 }
288
289 loop {
291 match self.next_ch() {
292 Some('-') | Some('_') => {}
293 Some('.') => is_float = true,
294 Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
295 _ => break,
296 }
297 }
298 let text = &self.source[begin..self.pos];
299 if is_float {
300 token(Token::Float(text), loc)
301 } else {
302 token(Token::Integer(text), loc)
303 }
304 }
305
306 fn scan_word(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
309 let begin = self.pos;
310 let loc = self.loc();
311
312 assert!(self.lookahead == Some('_') || self.lookahead.unwrap().is_ascii_alphabetic());
313 loop {
314 match self.next_ch() {
315 Some('_') | Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
316 _ => break,
317 }
318 }
319 let text = &self.source[begin..self.pos];
320
321 token(
323 split_entity_name(text)
324 .and_then(|(prefix, number)| {
325 Self::numbered_entity(prefix, number)
326 .or_else(|| Self::value_type(text, prefix, number))
327 })
328 .unwrap_or_else(|| match text {
329 "cold" => Token::Cold,
330 _ => Token::Identifier(text),
331 }),
332 loc,
333 )
334 }
335
336 fn numbered_entity(prefix: &str, number: u32) -> Option<Token<'a>> {
339 match prefix {
340 "v" => Value::with_number(number).map(Token::Value),
341 "block" => Block::with_number(number).map(Token::Block),
342 "ss" => Some(Token::StackSlot(number)),
343 "dss" => Some(Token::DynamicStackSlot(number)),
344 "dt" => Some(Token::DynamicType(number)),
345 "gv" => Some(Token::GlobalValue(number)),
346 "mt" => Some(Token::MemoryType(number)),
347 "const" => Some(Token::Constant(number)),
348 "fn" => Some(Token::FuncRef(number)),
349 "sig" => Some(Token::SigRef(number)),
350 "u" => Some(Token::UserRef(number)),
351 "userextname" => Some(Token::UserNameRef(number)),
352 _ => None,
353 }
354 }
355
356 fn value_type(text: &str, prefix: &str, number: u32) -> Option<Token<'a>> {
358 let is_vector = prefix.ends_with('x');
359 let scalar = if is_vector {
360 &prefix[0..prefix.len() - 1]
361 } else {
362 text
363 };
364 let base_type = match scalar {
365 "i8" => types::I8,
366 "i16" => types::I16,
367 "i32" => types::I32,
368 "i64" => types::I64,
369 "i128" => types::I128,
370 "f16" => types::F16,
371 "f32" => types::F32,
372 "f64" => types::F64,
373 "f128" => types::F128,
374 _ => return None,
375 };
376 if is_vector {
377 if number <= u32::from(u16::MAX) {
378 base_type.by(number).map(Token::Type)
379 } else {
380 None
381 }
382 } else {
383 Some(Token::Type(base_type))
384 }
385 }
386
387 fn scan_name(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
388 let loc = self.loc();
389 let begin = self.pos + 1;
390
391 assert_eq!(self.lookahead, Some('%'));
392
393 loop {
394 match self.next_ch() {
395 Some('_') | Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}
396 _ => break,
397 }
398 }
399
400 let end = self.pos;
401 token(Token::Name(&self.source[begin..end]), loc)
402 }
403
404 fn scan_string(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
406 let loc = self.loc();
407 let begin = self.pos + 1;
408
409 assert_eq!(self.lookahead, Some('"'));
410
411 while let Some(c) = self.next_ch() {
412 if c == '"' {
413 break;
414 }
415 }
416
417 let end = self.pos;
418 if self.lookahead != Some('"') {
419 return error(LexError::InvalidChar, self.loc());
420 }
421 self.next_ch();
422 token(Token::String(&self.source[begin..end]), loc)
423 }
424
425 fn scan_hex_sequence(&mut self) -> Result<LocatedToken<'a>, LocatedError> {
426 let loc = self.loc();
427 let begin = self.pos + 1;
428
429 assert_eq!(self.lookahead, Some('#'));
430
431 while let Some(c) = self.next_ch() {
432 if !char::is_digit(c, 16) {
433 break;
434 }
435 }
436
437 let end = self.pos;
438 token(Token::HexSequence(&self.source[begin..end]), loc)
439 }
440
441 fn looking_at_srcloc(&self) -> bool {
444 match self.lookahead {
445 Some(c) => char::is_digit(c, 16),
446 _ => false,
447 }
448 }
449
450 fn scan_srcloc(&mut self, pos: usize, loc: Location) -> Result<LocatedToken<'a>, LocatedError> {
451 let begin = pos + 1;
452 while let Some(c) = self.next_ch() {
453 if !char::is_digit(c, 16) {
454 break;
455 }
456 }
457
458 let end = self.pos;
459 token(Token::SourceLoc(&self.source[begin..end]), loc)
460 }
461
462 pub fn next(&mut self) -> Option<Result<LocatedToken<'a>, LocatedError>> {
466 loop {
467 let loc = self.loc();
468 return match self.lookahead {
469 None => None,
470 Some(';') => Some(self.scan_comment()),
471 Some('(') => Some(self.scan_char(Token::LPar)),
472 Some(')') => Some(self.scan_char(Token::RPar)),
473 Some('{') => Some(self.scan_char(Token::LBrace)),
474 Some('}') => Some(self.scan_char(Token::RBrace)),
475 Some('[') => Some(self.scan_char(Token::LBracket)),
476 Some(']') => Some(self.scan_char(Token::RBracket)),
477 Some(',') => Some(self.scan_char(Token::Comma)),
478 Some('.') => Some(self.scan_char(Token::Dot)),
479 Some(':') => Some(self.scan_char(Token::Colon)),
480 Some('=') => Some(self.scan_char(Token::Equal)),
481 Some('!') => Some(self.scan_char(Token::Bang)),
482 Some('+') => Some(self.scan_number()),
483 Some('*') => Some(self.scan_char(Token::Multiply)),
484 Some('-') => {
485 if self.looking_at("->") {
486 Some(self.scan_chars(2, Token::Arrow))
487 } else {
488 Some(self.scan_number())
489 }
490 }
491 Some('0'..='9') => Some(self.scan_number()),
492 Some('a'..='z') | Some('A'..='Z') => {
493 if self.looking_at("NaN") || self.looking_at("Inf") {
494 Some(self.scan_number())
495 } else {
496 Some(self.scan_word())
497 }
498 }
499 Some('%') => Some(self.scan_name()),
500 Some('"') => Some(self.scan_string()),
501 Some('#') => Some(self.scan_hex_sequence()),
502 Some('@') => {
503 let pos = self.pos;
504 let loc = self.loc();
505 self.next_ch();
506 if self.looking_at_srcloc() {
507 Some(self.scan_srcloc(pos, loc))
508 } else {
509 Some(token(Token::At, loc))
510 }
511 }
512 Some(' ') | Some('\x09'..='\x0d') => {
514 self.next_ch();
515 continue;
516 }
517 _ => {
518 self.next_ch();
520 Some(error(LexError::InvalidChar, loc))
521 }
522 };
523 }
524 }
525}
526
#[cfg(test)]
mod tests {
    use super::*;

    /// `trailing_digits` counts only the digits at the very end of the string.
    #[test]
    fn digits() {
        assert_eq!(trailing_digits(""), 0);
        assert_eq!(trailing_digits("x"), 0);
        assert_eq!(trailing_digits("0x"), 0);
        assert_eq!(trailing_digits("x1"), 1);
        assert_eq!(trailing_digits("1x1"), 1);
        assert_eq!(trailing_digits("1x01"), 2);
    }

    /// `split_entity_name` needs a trailing number without redundant leading zeros.
    #[test]
    fn entity_name() {
        assert_eq!(split_entity_name(""), None);
        assert_eq!(split_entity_name("x"), None);
        assert_eq!(split_entity_name("x+"), None);
        assert_eq!(split_entity_name("x+1"), Some(("x+", 1)));
        assert_eq!(split_entity_name("x-1"), Some(("x-", 1)));
        assert_eq!(split_entity_name("1"), Some(("", 1)));
        assert_eq!(split_entity_name("x1"), Some(("x", 1)));
        assert_eq!(split_entity_name("xy0"), Some(("xy", 0)));
        assert_eq!(split_entity_name("inst01"), None);
    }

    /// Expected result of `Lexer::next` for a token found on `line`.
    fn token<'a>(token: Token<'a>, line: usize) -> Option<Result<LocatedToken<'a>, LocatedError>> {
        Some(super::token(token, Location { line_number: line }))
    }

    /// Expected result of `Lexer::next` for an error found on `line`.
    fn error<'a>(error: LexError, line: usize) -> Option<Result<LocatedToken<'a>, LocatedError>> {
        Some(super::error(error, Location { line_number: line }))
    }

    /// Empty and whitespace-only sources produce no tokens.
    #[test]
    fn make_lexer() {
        let mut l1 = Lexer::new("");
        let mut l2 = Lexer::new(" ");
        let mut l3 = Lexer::new("\n ");

        assert_eq!(l1.next(), None);
        assert_eq!(l2.next(), None);
        assert_eq!(l3.next(), None);
    }

    #[test]
    fn lex_comment() {
        let mut lex = Lexer::new("; hello");
        assert_eq!(lex.next(), token(Token::Comment("; hello"), 1));
        assert_eq!(lex.next(), None);

        lex = Lexer::new("\n ;hello\n;foo");
        assert_eq!(lex.next(), token(Token::Comment(";hello"), 2));
        assert_eq!(lex.next(), token(Token::Comment(";foo"), 3));
        assert_eq!(lex.next(), None);

        // The lexer recovers after an invalid character and keeps going.
        let mut lex = Lexer::new("$; hello");
        assert_eq!(lex.next(), error(LexError::InvalidChar, 1));
        assert_eq!(lex.next(), token(Token::Comment("; hello"), 1));
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_chars() {
        let mut lex = Lexer::new("(); hello\n = :{, }.");
        assert_eq!(lex.next(), token(Token::LPar, 1));
        assert_eq!(lex.next(), token(Token::RPar, 1));
        assert_eq!(lex.next(), token(Token::Comment("; hello"), 1));
        assert_eq!(lex.next(), token(Token::Equal, 2));
        assert_eq!(lex.next(), token(Token::Colon, 2));
        assert_eq!(lex.next(), token(Token::LBrace, 2));
        assert_eq!(lex.next(), token(Token::Comma, 2));
        assert_eq!(lex.next(), token(Token::RBrace, 2));
        assert_eq!(lex.next(), token(Token::Dot, 2));
        assert_eq!(lex.next(), None);
    }

    /// Arrows, source locations and operators that stand alone.
    #[test]
    fn lex_operators_and_srclocs() {
        let mut lex = Lexer::new("-> @00c7 @ - + *");
        assert_eq!(lex.next(), token(Token::Arrow, 1));
        assert_eq!(lex.next(), token(Token::SourceLoc("00c7"), 1));
        assert_eq!(lex.next(), token(Token::At, 1));
        assert_eq!(lex.next(), token(Token::Minus, 1));
        assert_eq!(lex.next(), token(Token::Plus, 1));
        assert_eq!(lex.next(), token(Token::Multiply, 1));
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_numbers() {
        let mut lex = Lexer::new(" 0 2_000 -1,0xf -0x0 0.0 0x0.4p-34 NaN +5");
        assert_eq!(lex.next(), token(Token::Integer("0"), 1));
        assert_eq!(lex.next(), token(Token::Integer("2_000"), 1));
        assert_eq!(lex.next(), token(Token::Integer("-1"), 1));
        assert_eq!(lex.next(), token(Token::Comma, 1));
        assert_eq!(lex.next(), token(Token::Integer("0xf"), 1));
        assert_eq!(lex.next(), token(Token::Integer("-0x0"), 1));
        assert_eq!(lex.next(), token(Token::Float("0.0"), 1));
        assert_eq!(lex.next(), token(Token::Float("0x0.4p-34"), 1));
        assert_eq!(lex.next(), token(Token::Float("NaN"), 1));
        assert_eq!(lex.next(), token(Token::Integer("+5"), 1));
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_identifiers() {
        let mut lex = Lexer::new(
            "v0 v00 vx01 block1234567890 block5234567890 v1x vx1 vxvx4 \
             function0 function i8 i32x4 f32x5 f16 f128",
        );
        assert_eq!(
            lex.next(),
            token(Token::Value(Value::with_number(0).unwrap()), 1)
        );
        // Redundant leading zeros turn an entity reference into a plain identifier.
        assert_eq!(lex.next(), token(Token::Identifier("v00"), 1));
        assert_eq!(lex.next(), token(Token::Identifier("vx01"), 1));
        assert_eq!(
            lex.next(),
            token(Token::Block(Block::with_number(1234567890).unwrap()), 1)
        );
        // This number does not fit in a `u32`.
        assert_eq!(lex.next(), token(Token::Identifier("block5234567890"), 1));
        assert_eq!(lex.next(), token(Token::Identifier("v1x"), 1));
        assert_eq!(lex.next(), token(Token::Identifier("vx1"), 1));
        assert_eq!(lex.next(), token(Token::Identifier("vxvx4"), 1));
        assert_eq!(lex.next(), token(Token::Identifier("function0"), 1));
        assert_eq!(lex.next(), token(Token::Identifier("function"), 1));
        assert_eq!(lex.next(), token(Token::Type(types::I8), 1));
        assert_eq!(lex.next(), token(Token::Type(types::I32X4), 1));
        assert_eq!(lex.next(), token(Token::Identifier("f32x5"), 1));
        assert_eq!(lex.next(), token(Token::Type(types::F16), 1));
        assert_eq!(lex.next(), token(Token::Type(types::F128), 1));
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_hex_sequences() {
        let mut lex = Lexer::new("#0 #DEADbeef123 #789");

        assert_eq!(lex.next(), token(Token::HexSequence("0"), 1));
        assert_eq!(lex.next(), token(Token::HexSequence("DEADbeef123"), 1));
        assert_eq!(lex.next(), token(Token::HexSequence("789"), 1));
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_names() {
        let mut lex = Lexer::new("%0 %x3 %function %123_abc %ss0 %v3 %block11 %const42 %_");

        assert_eq!(lex.next(), token(Token::Name("0"), 1));
        assert_eq!(lex.next(), token(Token::Name("x3"), 1));
        assert_eq!(lex.next(), token(Token::Name("function"), 1));
        assert_eq!(lex.next(), token(Token::Name("123_abc"), 1));
        assert_eq!(lex.next(), token(Token::Name("ss0"), 1));
        assert_eq!(lex.next(), token(Token::Name("v3"), 1));
        assert_eq!(lex.next(), token(Token::Name("block11"), 1));
        assert_eq!(lex.next(), token(Token::Name("const42"), 1));
        assert_eq!(lex.next(), token(Token::Name("_"), 1));
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_strings() {
        // The whitespace at the start of the raw-string continuation lines is part of the
        // string; it must stay identical between this input and the expected token below.
        let mut lex = Lexer::new(
            r#""" "0" "x3""function" "123 abc" "\" "start
 and end on
 different lines" "#,
        );

        assert_eq!(lex.next(), token(Token::String(""), 1));
        assert_eq!(lex.next(), token(Token::String("0"), 1));
        assert_eq!(lex.next(), token(Token::String("x3"), 1));
        assert_eq!(lex.next(), token(Token::String("function"), 1));
        assert_eq!(lex.next(), token(Token::String("123 abc"), 1));
        // A backslash is an ordinary character: strings have no escapes.
        assert_eq!(lex.next(), token(Token::String(r#"\"#), 1));
        // A multi-line string is located at the line where it starts.
        assert_eq!(
            lex.next(),
            token(
                Token::String(
                    r#"start
 and end on
 different lines"#
                ),
                1
            )
        );
        assert_eq!(lex.next(), None);
    }

    #[test]
    fn lex_userrefs() {
        let mut lex = Lexer::new("u0 u1 u234567890 u9:8765");

        assert_eq!(lex.next(), token(Token::UserRef(0), 1));
        assert_eq!(lex.next(), token(Token::UserRef(1), 1));
        assert_eq!(lex.next(), token(Token::UserRef(234567890), 1));
        assert_eq!(lex.next(), token(Token::UserRef(9), 1));
        assert_eq!(lex.next(), token(Token::Colon, 1));
        assert_eq!(lex.next(), token(Token::Integer("8765"), 1));
        assert_eq!(lex.next(), None);
    }
}