From e5b15bf93f36a974500a470847e611e019205cde Mon Sep 17 00:00:00 2001 From: simon petit Date: Mon, 25 Nov 2024 10:36:33 +0000 Subject: [PATCH 1/3] Allowing cascading joins --- lib/ast.ml | 19 +++++++++++-- parser/parser.mly | 70 +++++++++++++++++++++++----------------------- test/SQL_parser.ml | 46 ++++++++++++++++++++++++++++-- 3 files changed, 96 insertions(+), 39 deletions(-) diff --git a/lib/ast.ml b/lib/ast.ml index 3f37dc3..5debe2f 100644 --- a/lib/ast.ml +++ b/lib/ast.ml @@ -6,7 +6,7 @@ and column = | Column of string and table = | Table of string - | Join of table * join_type * table + | Join of table * join_type * table * condition option and join_type = | Inner | Left @@ -15,4 +15,19 @@ and join_type = | Cross | Union | Natural - +and condition = + | Condition of string * comparison + | And of condition * condition + | Or of condition * condition + | Not of condition +and comparison = + | Comparison of operator * string +and operator = + | Equals + | NotEquals + | LessThan + | GreaterThan + | LessEquals + | GreaterEquals +and search_condition = + | Search of string diff --git a/parser/parser.mly b/parser/parser.mly index 2f2afc9..184c70b 100644 --- a/parser/parser.mly +++ b/parser/parser.mly @@ -53,7 +53,6 @@ table_reference : (* | table_primary_or_joined_table sample_clause { $1 } *) table_primary_or_joined_table: - (* | table_primary { Table($1) }*) | table_primary { $1 } | joined_table { $1 } @@ -70,24 +69,24 @@ joined_table : | union_join { $1 } cross_join: - | table_reference CROSS JOIN table_primary { Join($1, Cross, $4) } + | table_reference CROSS JOIN table_primary { Join($1, Cross, $4, None) } qualified_join: - | table_reference JOIN table_reference join_specification { Join($1, Left, $3) } - | table_reference join_type JOIN table_reference join_specification { Join($1, $2, $4) } + | table_reference JOIN table_reference join_specification { Join($1, Left, $3, $4) } + | table_reference join_type JOIN table_reference join_specification { Join($1, $2, $4, $5) } join_specification: - | join_condition {} + | join_condition { $1 } join_condition: - | ON search_condition {} + | ON search_condition { Some($2) } natural_join: - | table_reference NATURAL JOIN table_primary { Join($1, Natural, $4) } - | table_reference NATURAL join_type JOIN table_primary { Join($1, Natural, $5) } + | table_reference NATURAL JOIN table_primary { Join($1, Natural, $4, None) } + | table_reference NATURAL join_type JOIN table_primary { Join($1, Natural, $5, None) } union_join: - | table_reference UNION JOIN table_primary { Join($1,Union, $4) } + | table_reference UNION JOIN table_primary { Join($1, Union, $4, None) } table_name : | IDENT { Table($1) } @@ -107,43 +106,44 @@ where_clause : | WHERE search_condition { } search_condition: - | IDENT EQUALS_OPERATOR IDENT {} + (*| IDENT EQUALS_OPERATOR IDENT {}*) + | boolean_value_expression { $1 } boolean_value_expression: - | boolean_term {} - | boolean_value_expression OR boolean_term {} + | boolean_term { $1 } + | boolean_value_expression OR boolean_term { Or($1, $3) } boolean_term: - | boolean_factor {} - | boolean_term AND boolean_factor {} + | boolean_factor { $1 } + | boolean_term AND boolean_factor { And($1, $3) } boolean_factor: - | boolean_test {} - | NOT boolean_test {} + | boolean_test { $1 } + | NOT boolean_test { Not($2) } boolean_test: - | boolean_primary {} + | boolean_primary { $1 } boolean_primary : - | predicate {} - | boolean_predicand {} + | predicate { $1 } + (*| boolean_predicand {}*) predicate : - | comparison_predicate {} + | comparison_predicate { $1 } comparison_predicate : - | row_value_predicand comparison_predicate_part2 {} + | row_value_predicand comparison_predicate_part2 { Condition($1, $2) } comparison_predicate_part2: - | comp_op row_value_predicand {} + | comp_op row_value_predicand { Comparison($1, $2) } comp_op : - | EQUALS_OPERATOR {} - | not_equals_operator {} - | LESS_THAN_OPERATOR {} - | GREATER_THAN_OPERATOR {} - | less_than_or_equals_operator {} - | greater_than_or_equals_operator {} + | EQUALS_OPERATOR { Equals } + | not_equals_operator { NotEquals } + | LESS_THAN_OPERATOR { LessThan } + | GREATER_THAN_OPERATOR { GreaterThan } + | less_than_or_equals_operator { LessEquals } + | greater_than_or_equals_operator { GreaterEquals } not_equals_operator : | LESS_THAN_OPERATOR GREATER_THAN_OPERATOR {} @@ -155,23 +155,23 @@ greater_than_or_equals_operator: | GREATER_THAN_OPERATOR EQUALS_OPERATOR {} row_value_predicand: - | row_value_special_case {} + | row_value_special_case { $1 } row_value_special_case : - | nonparenthesized_value_expression_primary {} + | nonparenthesized_value_expression_primary { $1 } nonparenthesized_value_expression_primary: - | column_reference {} + | column_reference { $1 } column_reference: - | basic_identifier_chain {} + | basic_identifier_chain { $1 } basic_identifier_chain: - | identifier_chain {} + | identifier_chain { $1 } identifier_chain: - | IDENT {} - | identifier_chain DOT IDENT {} + | IDENT { $1 } + (*| identifier_chain DOT IDENT {}*) boolean_predicand: | nonparenthesized_value_expression_primary {} diff --git a/test/SQL_parser.ml b/test/SQL_parser.ml index a09f19c..eb1d4c3 100644 --- a/test/SQL_parser.ml +++ b/test/SQL_parser.ml @@ -7,5 +7,47 @@ let parse query = let () = assert(parse "SELECT ab FROM b1" = Query(Select([Column("ab")], [Table "b1"]))); assert(parse "SELECT * FROM b1" = Query(Select([Asterisk], [Table "b1"]))); - assert(parse "SELECT * FROM t1 CROSS JOIN t2" = Query(Select([Asterisk], [Join(Table("t1"), Cross, Table("t2"))]))); - assert(parse "SELECT * FROM t1 JOIN t2 ON a = b" = Query(Select([Asterisk], [Join(Table("t1"), Left, Table("t2"))]))); + assert(parse "SELECT * FROM t1 CROSS JOIN t2" = Query(Select([Asterisk], [Join(Table("t1"), Cross, Table("t2"), None)]))); + assert(parse "SELECT * FROM t1 JOIN t2 ON a = b" = Query( + Select([Asterisk], [ + Join( + Table("t1"), + Left, + Table("t2"), + Some( + Condition( + "a", + Comparison(Equals, "b") + ) + ) + ) + ] + ) + )); + assert(parse "SELECT * FROM t1 JOIN t2 ON a = b JOIN t3 ON c = d" = Query( + Select([Asterisk], [ + Join( + Join( + Table("t1"), + Left, + Table("t2"), + Some( + Condition( + "a", + Comparison(Equals, "b") + ) + ) + ), + Left, + Table("t3"), + Some( + Condition( + "c", + Comparison(Equals, "d") + ) + ) + ) + ] + ) + ) + ); From d7ee32a94120512c08c1be526f1c2d62c2b9bbd2 Mon Sep 17 00:00:00 2001 From: simon petit Date: Mon, 25 Nov 2024 16:52:10 +0000 Subject: [PATCH 2/3] WIP logical plan --- lexer/lexer.mll | 11 +++++- lib/.files.ml.swp | Bin 0 -> 12288 bytes lib/.logical_plan.ml.swp | Bin 0 -> 12288 bytes lib/dune | 5 +++ lib/files.ml | 1 + lib/logical_plan.ml | 27 +++++++++++++ lib/physical_plan.ml | 0 parser/parser.mly | 80 ++++++++++++++++++++++++++++++++++++-- test/SQL_parser.ml | 1 + test/dune | 4 ++ test/logical_plan_test.ml | 14 +++++++ 11 files changed, 139 insertions(+), 4 deletions(-) create mode 100644 lib/.files.ml.swp create mode 100644 lib/.logical_plan.ml.swp create mode 100644 lib/files.ml create mode 100644 lib/logical_plan.ml create mode 100644 lib/physical_plan.ml create mode 100644 test/logical_plan_test.ml diff --git a/lexer/lexer.mll b/lexer/lexer.mll index 61505db..4116cb7 100644 --- a/lexer/lexer.mll +++ b/lexer/lexer.mll @@ -9,6 +9,11 @@ let alphanumeric = (alpha|digit) rule read_token = parse | "SELECT" { SELECT } + | "AVG" { AVG } + | "MAX" { MAX } + | "MIN" { MIN } + | "SUM" { SUM } + | "COUNT" { COUNT } | "DISTINCT" { DISTINCT } | "FROM" { FROM } | "LEFT" { LEFT } @@ -17,11 +22,15 @@ rule read_token = parse | "UNION" { UNION } | "JOIN" { JOIN } | "ON" { ON } + | "GROUP" { GROUP } + | "BY" { BY } | "*" { ASTERISK } | "." { DOT } | "=" { EQUALS_OPERATOR } + | "(" { LEFT_PAREN } + | ")" { RIGHT_PAREN } + | "," { COMMA } | whitespace { read_token lexbuf } | "WHERE" { WHERE } | alpha alphanumeric* as ident { IDENT ident } - | "," { COMMA } | eof { EOF } diff --git a/lib/.files.ml.swp b/lib/.files.ml.swp new file mode 100644 index 0000000000000000000000000000000000000000..4c8a1520ef33323584158b15ed829eac4c727db3 GIT binary patch literal 12288 zcmeI&F;2rU6oBCuc2opCfPp;;Wnt+=7Y4?%MsDIlB&P|riC}^Ia2rm*#4!-qh)N9Y z(uqp{ldQxq@ss^+mOaaFuCCQZI+73vBKhm&cF+ywbC`&$n|6_Q#@9Z6-5B$-)lZe_ z?PFin`A?;_|33dIm`&R9p)m{ZvbNNXE!@2=I=ysenb~@l&1-91+Ss*)NFsp176SXz z^U+xz2cDiBtE0o|7Bv`x00IagfB*srAbeIW7`T0H+>?f+x{EwqoAClEjY0R#|0009ILKmY**5J2F6 l2`FWaSC#f!jn$>=Q`?3;e$2&}qIA8Ack9Dldw-|K@(n$KJof+q literal 0 HcmV?d00001 diff --git a/lib/.logical_plan.ml.swp b/lib/.logical_plan.ml.swp new file mode 100644 index 0000000000000000000000000000000000000000..061fd1dd26f789ae5c4d330cac8cf28b27e45366 GIT binary patch literal 12288 zcmeI2&2AGh5XT)(94J)K8wZAP*+ls$MM%8_R4GCNi32L-QbjAr>rGsFz3Hwu5GjHy z2VMb*j}s5T3&4eYkGuk}fW+9l4Gjv=Yhf(?W7+=CjQ#UoR@vd^llu>0yR*fxtut2p z&>!`$ZLsefHI}O+HJypH%DUCxwpgTm7Rjl8HjUJ#&$QC}{6xZTKNUqHjm^8M;E4|N zLmf_dmdmWG)v&A6krEuIG&h|@ugpi2M1TmKl0aeO_VyaPwRLmTMck-ghbx!&P8m!# zB0vO)01+SpM1Tko0U|&I{sjVdQe!XhAm^&XT&?=rFMTznjtCF|B0vO)01+SpM1Tko z0U|&IhyW2dg9LcQ*vAWu_119n`2YX$`~SB~jD119Mnx!rx{GR~&ZCYlGWHqu3H1i` z3Z+pysN1M@6hnQ*yho__sCTHhsF$cIYKZc6&mcG2G7%sGM1Tko0U|&IhyW2F0{I;sJ}u-nD}*gF=?c2mV7|kjku)-MY2z$SLZNw)OW4oUXlw@y(?Muw z6WLmfR|jbs+rhEvL8?rE0dv^j$!*6EouLI)GVnE85H<$YsnPn_Q3woAWI5~6_#vh$ zsrVD+B^Pd@)d#3d2nc)|^1K9BjUCX%9Q6K}L>Ua4gFD5eLT0mIS86G25VoK!(^$SX zo?E{=@Yj|}@iY6$@7BTj{08UX=s+67(t6XEJisMSADYX{xT&ByhkHs}nXL#Gsfm5Q-iT literal 0 HcmV?d00001 diff --git a/lib/dune b/lib/dune index 84b7fb2..699c6d1 100644 --- a/lib/dune +++ b/lib/dune @@ -1,3 +1,8 @@ (library (modules ast) (name ast)) + +(library + (modules logical_plan) + (libraries ast) + (name logical_plan)) diff --git a/lib/files.ml b/lib/files.ml new file mode 100644 index 0000000..10a1994 --- /dev/null +++ b/lib/files.ml @@ -0,0 +1 @@ +let load_csv file_path = diff --git a/lib/logical_plan.ml b/lib/logical_plan.ml new file mode 100644 index 0000000..af0ea1b --- /dev/null +++ b/lib/logical_plan.ml @@ -0,0 +1,27 @@ +type logical_plan = + | Scan of string (* Table name *) + (*| Filter of logical_plan * condition*) + | Join of logical_plan * Ast.join_type * logical_plan + +let rec generate_logical_plan ast = + match ast with + | Ast.Query(Select(_, tables)) -> + let base_plan = generate_from_clause tables in + base_plan + +and generate_from_clause tables = + match tables with + | [Table(name)] -> Scan(name) + | [Ast.Join(left, j_type, right, _)] -> + Join( + generate_from_clause [left], + j_type, + generate_from_clause [right] + ) + | _ -> failwith "Unsupported table structure" + + +let evaluate_plan plan = + match plan with + | Scan(table) -> + | _ -> failwith "Unsupported plan" diff --git a/lib/physical_plan.ml b/lib/physical_plan.ml new file mode 100644 index 0000000..e69de29 diff --git a/parser/parser.mly b/parser/parser.mly index 184c70b..c72b6d4 100644 --- a/parser/parser.mly +++ b/parser/parser.mly @@ -7,10 +7,12 @@ open Ast %token LEFT RIGHT FULL INNER OUTER %token CROSS NATURAL UNION JOIN %token GREATER_THAN_OPERATOR LESS_THAN_OPERATOR EQUALS_OPERATOR +%token MAX MIN SUM COUNT AVG %token IDENT %token COMMA DOT +%token LEFT_PAREN RIGHT_PAREN %token ASTERISK -%token AS ON +%token AS ON GROUP BY FILTER %token OR AND NOT %token EOF %start main @@ -23,9 +25,9 @@ main: select_stmt : | SELECT select_list table_expression { Select($2, $3) } - | SELECT set_identifier select_list table_expression { Select($3, $4) } + | SELECT set_quantifier select_list table_expression { Select($3, $4) } -set_identifier : +set_quantifier : | ALL {} | DISTINCT {} @@ -37,6 +39,33 @@ select_sublist : | IDENT { [Column($1)] } | select_sublist COMMA IDENT { Column($3)::$1 } +derived_column: + | value_expression {} + | value_expression as_clause {} + +as_clause : + | AS column_name {} + | column_name {} + +column_name : + | IDENT {} + +value_expression: + | common_value_expression {} + +common_value_expression: + | reference_value_expression {} + +reference_value_expression: + | value_expression_primary {} + +value_expression_primary: + | parenthesized_value_expression {} + | nonparenthesized_value_expression_primary {} + +parenthesized_value_expression: + | LEFT_PAREN value_expression RIGHT_PAREN {} + table_expression: | from_clause { $1 } | from_clause where_clause { $1 } @@ -162,6 +191,33 @@ row_value_special_case : nonparenthesized_value_expression_primary: | column_reference { $1 } +(* | set_function_specification { $1 }*) + +set_function_specification: + | aggregate_function { $1 } + +aggregate_function: + | COUNT LEFT_PAREN ASTERISK RIGHT_PAREN { Asterisk } + | COUNT LEFT_PAREN ASTERISK RIGHT_PAREN filter_clause { Asterisk } + | general_set_function { $1 } + | general_set_function filter_clause { $1 } + +general_set_function: + | set_function_type LEFT_PAREN value_expression RIGHT_PAREN { $3 } + | set_function_type LEFT_PAREN set_quantifier value_expression RIGHT_PAREN { $4 } + +set_function_type: + | computationnal_operation {} + +computationnal_operation: + | AVG {} + | MAX {} + | MIN {} + | SUM {} + | COUNT {} + +filter_clause : + | FILTER LEFT_PAREN WHERE search_condition RIGHT_PAREN {} column_reference: | basic_identifier_chain { $1 } @@ -175,3 +231,21 @@ identifier_chain: boolean_predicand: | nonparenthesized_value_expression_primary {} + +group_by_clause: + | GROUP BY grouping_element_list {} + | GROUP BY set_quantifier grouping_element_list {} + +grouping_element_list : + | grouping_element {} + | grouping_element_list COMMA grouping_element_list {} + +grouping_element: + | ordinary_grouping_set {} + +ordinary_grouping_set : + | grouping_column_reference {} + +grouping_column_reference: + | column_reference {} + (*| column_reference collate_clause {}*) diff --git a/test/SQL_parser.ml b/test/SQL_parser.ml index eb1d4c3..a2afa92 100644 --- a/test/SQL_parser.ml +++ b/test/SQL_parser.ml @@ -6,6 +6,7 @@ let parse query = let () = assert(parse "SELECT ab FROM b1" = Query(Select([Column("ab")], [Table "b1"]))); + assert(parse "SELECT ab FROM test" = Query(Select([Column("ab")], [Table "test"]))); assert(parse "SELECT * FROM b1" = Query(Select([Asterisk], [Table "b1"]))); assert(parse "SELECT * FROM t1 CROSS JOIN t2" = Query(Select([Asterisk], [Join(Table("t1"), Cross, Table("t2"), None)]))); assert(parse "SELECT * FROM t1 JOIN t2 ON a = b" = Query( diff --git a/test/dune b/test/dune index 20beef1..96751a9 100644 --- a/test/dune +++ b/test/dune @@ -1,3 +1,7 @@ (test (name SQL_parser) (libraries parser lexer ast)) + +(test + (name logical_plan_test) + (libraries ast logical_plan)) diff --git a/test/logical_plan_test.ml b/test/logical_plan_test.ml new file mode 100644 index 0000000..9e1ffea --- /dev/null +++ b/test/logical_plan_test.ml @@ -0,0 +1,14 @@ +open Ast + +let () = + let ast1 = Query(Select([Column("ab")], [Table "b1"])) in + assert( Logical_plan.generate_logical_plan ast1 = Logical_plan.Scan("b1")); + let ast2 = Query(Select([Asterisk], [Join(Table("t1"), Cross, Table("t2"), None)])) in + assert(Logical_plan.generate_logical_plan ast2 = + Logical_plan.Join( + Logical_plan.Scan("t1"), + Cross, + Logical_plan.Scan("t2") + ) + ); + From 2f0eacf381753b3803566fc3c5c2b1770f00510c Mon Sep 17 00:00:00 2001 From: simon petit Date: Mon, 25 Nov 2024 17:13:27 +0000 Subject: [PATCH 3/3] draft to read csv --- lib/.files.ml.swp | Bin 12288 -> 0 bytes lib/.logical_plan.ml.swp | Bin 12288 -> 0 bytes lib/csv.ml | 11 +++++++++++ lib/files.ml | 1 - 4 files changed, 11 insertions(+), 1 deletion(-) delete mode 100644 lib/.files.ml.swp delete mode 100644 lib/.logical_plan.ml.swp create mode 100644 lib/csv.ml diff --git a/lib/.files.ml.swp b/lib/.files.ml.swp deleted file mode 100644 index 4c8a1520ef33323584158b15ed829eac4c727db3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI&F;2rU6oBCuc2opCfPp;;Wnt+=7Y4?%MsDIlB&P|riC}^Ia2rm*#4!-qh)N9Y z(uqp{ldQxq@ss^+mOaaFuCCQZI+73vBKhm&cF+ywbC`&$n|6_Q#@9Z6-5B$-)lZe_ z?PFin`A?;_|33dIm`&R9p)m{ZvbNNXE!@2=I=ysenb~@l&1-91+Ss*)NFsp176SXz z^U+xz2cDiBtE0o|7Bv`x00IagfB*srAbeIW7`T0H+>?f+x{EwqoAClEjY0R#|0009ILKmY**5J2F6 l2`FWaSC#f!jn$>=Q`?3;e$2&}qIA8Ack9Dldw-|K@(n$KJof+q diff --git a/lib/.logical_plan.ml.swp b/lib/.logical_plan.ml.swp deleted file mode 100644 index 061fd1dd26f789ae5c4d330cac8cf28b27e45366..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI2&2AGh5XT)(94J)K8wZAP*+ls$MM%8_R4GCNi32L-QbjAr>rGsFz3Hwu5GjHy z2VMb*j}s5T3&4eYkGuk}fW+9l4Gjv=Yhf(?W7+=CjQ#UoR@vd^llu>0yR*fxtut2p z&>!`$ZLsefHI}O+HJypH%DUCxwpgTm7Rjl8HjUJ#&$QC}{6xZTKNUqHjm^8M;E4|N zLmf_dmdmWG)v&A6krEuIG&h|@ugpi2M1TmKl0aeO_VyaPwRLmTMck-ghbx!&P8m!# zB0vO)01+SpM1Tko0U|&I{sjVdQe!XhAm^&XT&?=rFMTznjtCF|B0vO)01+SpM1Tko z0U|&IhyW2dg9LcQ*vAWu_119n`2YX$`~SB~jD119Mnx!rx{GR~&ZCYlGWHqu3H1i` z3Z+pysN1M@6hnQ*yho__sCTHhsF$cIYKZc6&mcG2G7%sGM1Tko0U|&IhyW2F0{I;sJ}u-nD}*gF=?c2mV7|kjku)-MY2z$SLZNw)OW4oUXlw@y(?Muw z6WLmfR|jbs+rhEvL8?rE0dv^j$!*6EouLI)GVnE85H<$YsnPn_Q3woAWI5~6_#vh$ zsrVD+B^Pd@)d#3d2nc)|^1K9BjUCX%9Q6K}L>Ua4gFD5eLT0mIS86G25VoK!(^$SX zo?E{=@Yj|}@iY6$@7BTj{08UX=s+67(t6XEJisMSADYX{xT&ByhkHs}nXL#Gsfm5Q-iT diff --git a/lib/csv.ml b/lib/csv.ml new file mode 100644 index 0000000..1e512ee --- /dev/null +++ b/lib/csv.ml @@ -0,0 +1,11 @@ +let load path delimiter header filter = + let ic = open_in path in + let rec aux acc = + try + let line = input_line ic + + let rows = [] + let line = read_line ic in + let columns = String.split_on_char delimiter line in + + diff --git a/lib/files.ml b/lib/files.ml index 10a1994..e69de29 100644 --- a/lib/files.ml +++ b/lib/files.ml @@ -1 +0,0 @@ -let load_csv file_path =