From: Martin Quinson Date: Mon, 3 Sep 2018 07:20:56 +0000 (+0200) Subject: Somehow fix the killing of actors in Java X-Git-Tag: v3_21~118 X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/d535c50c617ba838b99de4bd251a6ac076774d00?hp=609ba683d48cc9eb7570548635e0b6218c57a747 Somehow fix the killing of actors in Java Things are somehow fixed, as all tests seem to pass, but the situation is still very messy after this commit. Contents: - Reimplement ContextJava as subclass of ContextThread to reduce duplication. - Don't send the StopRequest exception on host failure if we are in Java because *some* of the actors don't catch it well, resulting in simulation failure. - Forcefully kill the process ("exit(0)" in C) after MSG_run() because dead actors are sometimes not completely killed, preventing the simulation from ending. See the comment in ActorImpl for a better understanding of this mess and how to fix it in the future. --- diff --git a/ChangeLog b/ChangeLog index c3697b7a23..2d67518536 100644 --- a/ChangeLog +++ b/ChangeLog @@ -23,6 +23,10 @@ XBT: - Remove portability wrapper to condition variables - Remove xbt_os_thread_yield() +Java: + - Due to an internal bug, Msg.run() must now be your last line. + We hope to fix it in a future release, and we are sorry for the inconvenience. + Fixed bugs: - #22: Process autorestart seem to only work with CAS01 cpus - #93: simgrid should not eat --help diff --git a/examples/java/app/bittorrent/app-bittorrent.tesh b/examples/java/app/bittorrent/app-bittorrent.tesh index d5957e7d99..c9608c654e 100644 --- a/examples/java/app/bittorrent/app-bittorrent.tesh +++ b/examples/java/app/bittorrent/app-bittorrent.tesh @@ -5,7 +5,7 @@ $ java -classpath ${classpath:=.} app/bittorrent/Main ${srcdir:=.}/../platforms/cluster.xml ${srcdir:=.}/app/bittorrent/bittorrent.xml > [0.000000] [java/INFO] Using regular java threads. -> [5000.046836] [java/INFO] MSG_main finished; Cleaning up the simulation... 
+> [5000.046836] [java/INFO] MSG_main finished; Terminating the simulation... > [node-0.acme.org:app.bittorrent.Tracker:(1) 0.000000] [java/INFO] Tracker launched. > [node-0.acme.org:app.bittorrent.Tracker:(1) 3000.000000] [java/INFO] Tracker is leaving > [node-1.acme.org:app.bittorrent.Peer:(2) 0.000000] [java/INFO] Hi, I'm joining the network with id 2 diff --git a/examples/java/app/centralizedmutex/app-centralizedmutex.tesh b/examples/java/app/centralizedmutex/app-centralizedmutex.tesh index ecacf3983e..5496f03f0d 100644 --- a/examples/java/app/centralizedmutex/app-centralizedmutex.tesh +++ b/examples/java/app/centralizedmutex/app-centralizedmutex.tesh @@ -11,4 +11,4 @@ $ java -classpath ${classpath:=.} app/centralizedmutex/Main ${srcdir:=.}/../plat > [Fafard:app.centralizedmutex.Node:(3) 0.063737] [java/INFO] Wait for a grant from the coordinator > [Tremblay:app.centralizedmutex.Coordinator:(1) 0.063737] [java/INFO] Got a request from app.centralizedmutex.Node. Queue empty: grant it > [Tremblay:app.centralizedmutex.Coordinator:(1) 0.134167] [java/INFO] we should shutdown the simulation now -> [0.134167] [java/INFO] MSG_main finished; Cleaning up the simulation... +> [0.134167] [java/INFO] MSG_main finished; Terminating the simulation... diff --git a/examples/java/app/masterworker/app-masterworker.tesh b/examples/java/app/masterworker/app-masterworker.tesh index 4a30a5ef3a..ea836e1cc2 100644 --- a/examples/java/app/masterworker/app-masterworker.tesh +++ b/examples/java/app/masterworker/app-masterworker.tesh @@ -17,4 +17,4 @@ $ java -classpath ${classpath:=.} app/masterworker/Main ${srcdir:=.}/../platform > [ 5.628842] (7:app.masterworker.Worker@Jupiter) Received Finalize. I'm done. See you! > [ 5.629037] (8:app.masterworker.Worker@Jacquelin) Received Finalize. I'm done. See you! > [ 5.629037] (1:app.masterworker.Master@Jacquelin) Goodbye now! -> [ 5.629037] (0:maestro@) MSG_main finished; Cleaning up the simulation... 
+> [ 5.629037] (0:maestro@) MSG_main finished; Terminating the simulation... diff --git a/examples/java/app/pingpong/app-pingpong.tesh b/examples/java/app/pingpong/app-pingpong.tesh index de0f5e8c65..003140bfee 100644 --- a/examples/java/app/pingpong/app-pingpong.tesh +++ b/examples/java/app/pingpong/app-pingpong.tesh @@ -20,4 +20,4 @@ $ java -classpath ${classpath:=.} app/pingpong/Main ${srcdir:=.}/../platforms/sm > [Boivin:Receiver:(2) 3.146646] [java/INFO] --- bw 9.533962169004266E7 ---- > [Boivin:Receiver:(2) 3.146646] [java/INFO] Done. > [Jacquelin:Sender:(1) 3.146646] [java/INFO] Done. -> [3.146646] [java/INFO] MSG_main finished; Cleaning up the simulation... +> [3.146646] [java/INFO] MSG_main finished; Terminating the simulation... diff --git a/examples/java/app/tokenring/Main.java b/examples/java/app/tokenring/Main.java index 648339c3e2..e4d0f633a2 100644 --- a/examples/java/app/tokenring/Main.java +++ b/examples/java/app/tokenring/Main.java @@ -29,7 +29,5 @@ class Main { } Msg.info("Number of hosts '"+hosts.length+"'"); Msg.run(); - - Msg.info("Simulation time "+Msg.getClock()); } } diff --git a/examples/java/app/tokenring/app-tokenring.tesh b/examples/java/app/tokenring/app-tokenring.tesh index cef02614b7..c7ed48bba9 100644 --- a/examples/java/app/tokenring/app-tokenring.tesh +++ b/examples/java/app/tokenring/app-tokenring.tesh @@ -15,8 +15,7 @@ $ java -classpath ${classpath:=.} app/tokenring/Main ${srcdir:=.}/../platforms/r > [ 0.101019] (6:5@host6) Host '5' received 'Token' > [ 0.101019] (6:5@host6) Host '5' send 'Token' to Host '0' > [ 0.131796] (1:0@host1) Host '0' received 'Token' -> [ 0.131796] (0:maestro@) MSG_main finished; Cleaning up the simulation... -> [ 0.131796] (0:maestro@) Simulation time 0.13179602061855672 +> [ 0.131796] (0:maestro@) MSG_main finished; Terminating the simulation... 
$ java -classpath ${classpath:=.} app/tokenring/Main ${srcdir:=.}/../platforms/two_peers.xml '--log=root.fmt:[%12.6r]%e(%i:%P@%h)%e%m%n' > [ 0.000000] (0:maestro@) Using regular java threads. @@ -25,8 +24,7 @@ $ java -classpath ${classpath:=.} app/tokenring/Main ${srcdir:=.}/../platforms/t > [ 0.624423] (2:1@100036570) Host '1' received 'Token' > [ 0.624423] (2:1@100036570) Host '1' send 'Token' to Host '0' > [ 1.248846] (1:0@100030591) Host '0' received 'Token' -> [ 1.248846] (0:maestro@) MSG_main finished; Cleaning up the simulation... -> [ 1.248846] (0:maestro@) Simulation time 1.2488464578972847 +> [ 1.248846] (0:maestro@) MSG_main finished; Terminating the simulation... $ java -classpath ${classpath:=.} app/tokenring/Main ${srcdir:=.}/../platforms/meta_cluster.xml '--log=root.fmt:[%10.6r]%e(%i:%P@%h)%e%m%n' > [ 0.000000] (0:maestro@) Using regular java threads. @@ -151,5 +149,4 @@ $ java -classpath ${classpath:=.} app/tokenring/Main ${srcdir:=.}/../platforms/m > [ 1.791501] (60:59@host-9.cluster2) Host '59' received 'Token' > [ 1.791501] (60:59@host-9.cluster2) Host '59' send 'Token' to Host '0' > [ 1.821865] (1:0@host-1.cluster1) Host '0' received 'Token' -> [ 1.821865] (0:maestro@) MSG_main finished; Cleaning up the simulation... -> [ 1.821865] (0:maestro@) Simulation time 1.8218653608247406 +> [ 1.821865] (0:maestro@) MSG_main finished; Terminating the simulation... diff --git a/examples/java/async/dsend/async-dsend.tesh b/examples/java/async/dsend/async-dsend.tesh index 60b948d338..be3fc5e2ec 100644 --- a/examples/java/async/dsend/async-dsend.tesh +++ b/examples/java/async/dsend/async-dsend.tesh @@ -24,4 +24,4 @@ $ java -classpath ${classpath:=.} async/dsend/Main ${srcdir:=.}/../platforms/sma > [ 2.964768] (3:Receiver@Fafard) Received a task. I'm done. See you! > [ 4.162002] (5:Receiver@Jacquelin) Received a task. I'm done. See you! > [ 20.000000] (1:Sender@Boivin) Done sleeping. Goodbye now! 
-> [ 20.000000] (0:maestro@) MSG_main finished; Cleaning up the simulation... +> [ 20.000000] (0:maestro@) MSG_main finished; Terminating the simulation... diff --git a/examples/java/async/waitall/async-waitall.tesh b/examples/java/async/waitall/async-waitall.tesh index 95f31eeb02..e9aba7eea3 100644 --- a/examples/java/async/waitall/async-waitall.tesh +++ b/examples/java/async/waitall/async-waitall.tesh @@ -23,4 +23,4 @@ $ java -classpath ${classpath:=.} async/waitall/Main ${srcdir:=.}/../platforms/s > [ 2.964768] (3:Receiver@Fafard) I got my task, good bye. > [ 4.162002] (5:Receiver@Jacquelin) I got my task, good bye. > [ 4.162002] (1:Sender@Boivin) Goodbye now! -> [ 4.162002] (0:maestro@) MSG_main finished; Cleaning up the simulation... +> [ 4.162002] (0:maestro@) MSG_main finished; Terminating the simulation... diff --git a/examples/java/async/yield/async-yield.tesh b/examples/java/async/yield/async-yield.tesh index de936b4b61..9c7526d4ad 100644 --- a/examples/java/async/yield/async-yield.tesh +++ b/examples/java/async/yield/async-yield.tesh @@ -5,4 +5,4 @@ $ java -classpath ${classpath:=.} async/yield/Main ${srcdir:=.}/../platforms/sma > [ 0.000000] (0:maestro@) Using regular java threads. > [ 0.000000] (1:Yielder@Boivin) Yielded 10. Good bye now! > [ 0.000000] (2:Yielder@Bourassa) Yielded 15. Good bye now! -> [ 0.000000] (0:maestro@) MSG_main finished; Cleaning up the simulation... +> [ 0.000000] (0:maestro@) MSG_main finished; Terminating the simulation... 
diff --git a/examples/java/cloud/masterworker/cloud-masterworker.tesh b/examples/java/cloud/masterworker/cloud-masterworker.tesh index 4010bc5840..d33fcb0c74 100644 --- a/examples/java/cloud/masterworker/cloud-masterworker.tesh +++ b/examples/java/cloud/masterworker/cloud-masterworker.tesh @@ -202,4 +202,4 @@ $ java -classpath ${classpath:=.} cloud/masterworker/Main ${srcdir:=.}/../platfo > [49971.662691] (1:Master@Boivin) Send some work to everyone > [49973.849223] (1:Master@Boivin) Suspend all VMs, wait a while, resume them, migrate them and shut them down. > [50971.662691] (1:Master@Boivin) XXXXXXXXXXXXXXX Step 50 done. -> [50971.662691] (0:maestro@) MSG_main finished; Cleaning up the simulation... +> [50971.662691] (0:maestro@) MSG_main finished; Terminating the simulation... diff --git a/examples/java/cloud/migration/cloud-migration.tesh b/examples/java/cloud/migration/cloud-migration.tesh index 574046458d..71be7e6f20 100644 --- a/examples/java/cloud/migration/cloud-migration.tesh +++ b/examples/java/cloud/migration/cloud-migration.tesh @@ -30,4 +30,4 @@ $ java -classpath ${classpath:=.} cloud/migration/Main ${srcdir:=.}/../platforms > [PM0:Test:(1) 183.918679] [java/INFO] End of migration of VM vm0 to node PM0 > [PM0:Test:(1) 183.918679] [java/INFO] - End of Migration from PM1 to PM0 (duration:43.207501264227034) > [PM0:Test:(1) 183.918679] [java/INFO] Forcefully destroy VMs -> [183.918679] [java/INFO] MSG_main finished; Cleaning up the simulation... +> [183.918679] [java/INFO] MSG_main finished; Terminating the simulation... diff --git a/examples/java/dht/chord/dht-chord.tesh b/examples/java/dht/chord/dht-chord.tesh index cb93b6dc4f..6f75315785 100644 --- a/examples/java/dht/chord/dht-chord.tesh +++ b/examples/java/dht/chord/dht-chord.tesh @@ -4,7 +4,7 @@ $ java -classpath ${classpath:=.} dht/chord/Main ${srcdir:=.}/../platforms/cluster.xml ${srcdir:=.}/dht/chord/chord.xml > [0.000000] [java/INFO] Using regular java threads. 
-> [1046.732943] [java/INFO] MSG_main finished; Cleaning up the simulation... +> [1046.732943] [java/INFO] MSG_main finished; Terminating the simulation... > [node-1.acme.org:dht.chord.Node:(2) 0.000000] [java/INFO] Joining the ring with id 366680 knowing node 42 > [node-2.acme.org:dht.chord.Node:(3) 0.000000] [java/INFO] Joining the ring with id 533744 knowing node 366680 > [node-3.acme.org:dht.chord.Node:(4) 0.000000] [java/INFO] Joining the ring with id 1319738 knowing node 42 diff --git a/examples/java/dht/kademlia/dht-kademlia.tesh b/examples/java/dht/kademlia/dht-kademlia.tesh index c69a56d0d9..d2c6f69cb2 100644 --- a/examples/java/dht/kademlia/dht-kademlia.tesh +++ b/examples/java/dht/kademlia/dht-kademlia.tesh @@ -4,7 +4,7 @@ $ java -classpath ${classpath:=.} dht/kademlia/Main ${srcdir:=.}/../platforms/cluster.xml ${srcdir:=.}/dht/kademlia/kademlia.xml > [0.000000] [java/INFO] Using regular java threads. -> [900.000000] [java/INFO] MSG_main finished; Cleaning up the simulation... +> [900.000000] [java/INFO] MSG_main finished; Terminating the simulation... > [node-0.acme.org:dht.kademlia.Node:(1) 0.000000] [java/INFO] Hi, I'm going to create the network with the id 0! > [node-0.acme.org:dht.kademlia.Node:(1) 900.000000] [java/INFO] 8/8 FIND_NODE have succedded. > [node-1.acme.org:dht.kademlia.Node:(2) 0.000000] [java/INFO] Hi, I'm going to join the network with the id 1! 
diff --git a/examples/java/energy/consumption/Main.java b/examples/java/energy/consumption/Main.java index d9b7991008..0fdc76e4ff 100644 --- a/examples/java/energy/consumption/Main.java +++ b/examples/java/energy/consumption/Main.java @@ -29,6 +29,5 @@ public class Main { new EnergyConsumer("MyHost1","energyConsumer").start(); /* Execute the simulation */ Msg.run(); - Msg.info("Total simulation time: "+ Msg.getClock()); } } diff --git a/examples/java/energy/consumption/energy-consumption.tesh b/examples/java/energy/consumption/energy-consumption.tesh index 8bd70cfe14..4a6c3ff359 100644 --- a/examples/java/energy/consumption/energy-consumption.tesh +++ b/examples/java/energy/consumption/energy-consumption.tesh @@ -9,8 +9,7 @@ $ java -classpath ${classpath:=.} energy/consumption/Main ${srcdir:=.}/../platfo > [MyHost1:energyConsumer:(1) 10.000000] [java/INFO] Currently consumed energy after sleeping 10 sec: 1000.0 > [MyHost1:energyConsumer:(1) 20.000000] [java/INFO] Currently consumed energy after executing 1E9 flops: 2200.0 > [20.000000] [surf_energy/INFO] Total energy consumption: 6200.000000 Joules (used hosts: 2200.000000 Joules; unused/idle hosts: 4000.000000) -> [20.000000] [java/INFO] MSG_main finished; Cleaning up the simulation... -> [20.000000] [java/INFO] Total simulation time: 20.0 +> [20.000000] [java/INFO] MSG_main finished; Terminating the simulation... 
> [20.000000] [surf_energy/INFO] Energy consumption of host MyHost1: 2200.000000 Joules > [20.000000] [surf_energy/INFO] Energy consumption of host MyHost2: 2000.000000 Joules > [20.000000] [surf_energy/INFO] Energy consumption of host MyHost3: 2000.000000 Joules diff --git a/examples/java/energy/pstate/energy-pstate.tesh b/examples/java/energy/pstate/energy-pstate.tesh index f1000a3b1d..b35f2d28bb 100644 --- a/examples/java/energy/pstate/energy-pstate.tesh +++ b/examples/java/energy/pstate/energy-pstate.tesh @@ -19,7 +19,7 @@ $ java -classpath ${classpath:=.} energy/pstate/Main ${srcdir:=.}/../platforms/e > [MyHost2:dvfs_test:(3) 6.000000] [java/INFO] Count of Processor states=3 > [MyHost2:dvfs_test:(3) 6.000000] [java/INFO] Current power peak=2.0E7 > [6.000000] [surf_energy/INFO] Total energy consumption: 2195.000000 Joules (used hosts: 1595.000000 Joules; unused/idle hosts: 600.000000) -> [6.000000] [java/INFO] MSG_main finished; Cleaning up the simulation... +> [6.000000] [java/INFO] MSG_main finished; Terminating the simulation... > [6.000000] [surf_energy/INFO] Energy consumption of host MyHost1: 645.000000 Joules > [6.000000] [surf_energy/INFO] Energy consumption of host MyHost2: 950.000000 Joules > [6.000000] [surf_energy/INFO] Energy consumption of host MyHost3: 600.000000 Joules diff --git a/examples/java/energy/vm/energy-vm.tesh b/examples/java/energy/vm/energy-vm.tesh index f3348f6f09..e07fb6aa35 100644 --- a/examples/java/energy/vm/energy-vm.tesh +++ b/examples/java/energy/vm/energy-vm.tesh @@ -15,7 +15,7 @@ $ java -classpath ${classpath:=.} energy/vm/Main ${srcdir:=.}/../platforms/energ > [vmHost1:p11:(2) 6.000000] [java/INFO] This worker is done. > [vmHost3:p21:(4) 6.000000] [java/INFO] This worker is done. > [10.000000] [surf_energy/INFO] Total energy consumption: 4320.000000 Joules (used hosts: 4320.000000 Joules; unused/idle hosts: 0.000000) -> [10.000000] [java/INFO] MSG_main finished; Cleaning up the simulation... 
+> [10.000000] [java/INFO] MSG_main finished; Terminating the simulation... > [10.000000] [surf_energy/INFO] Energy consumption of host MyHost1: 1120.000000 Joules > [10.000000] [surf_energy/INFO] Energy consumption of host MyHost2: 1600.000000 Joules > [10.000000] [surf_energy/INFO] Energy consumption of host MyHost3: 1600.000000 Joules diff --git a/examples/java/io/file/io-file.tesh b/examples/java/io/file/io-file.tesh index 1830e428ed..d70cf9083a 100644 --- a/examples/java/io/file/io-file.tesh +++ b/examples/java/io/file/io-file.tesh @@ -23,4 +23,4 @@ $ java -classpath ${classpath:=.} io/file/Main ${srcdir:=.}/../platforms/storage > [carl:2:(3) 0.003433] [java/INFO] Seek back to the beginning of /home/doc/simgrid/examples/platforms/g5k_cabinets.xml > [bob:1:(2) 0.004414] [java/INFO] Having read 104028 on /home/doc/simgrid/examples/platforms/nancy.xml > [carl:2:(3) 0.004533] [java/INFO] Having read 110000 on /home/doc/simgrid/examples/platforms/g5k_cabinets.xml -> [0.004533] [java/INFO] MSG_main finished; Cleaning up the simulation... +> [0.004533] [java/INFO] MSG_main finished; Terminating the simulation... diff --git a/examples/java/io/storage/io-storage.tesh b/examples/java/io/storage/io-storage.tesh index 87153bef28..d89cae9b61 100644 --- a/examples/java/io/storage/io-storage.tesh +++ b/examples/java/io/storage/io-storage.tesh @@ -22,4 +22,4 @@ $ java -classpath ${classpath:=.} io/storage/Main ${srcdir:=.}/../platforms/stor > [denise:0:(1) 0.000000] [java/INFO] Disk: Disk2 > [denise:0:(1) 0.000000] [java/INFO] Disk: Disk3 > [denise:0:(1) 0.000000] [java/INFO] Disk: Disk4 -> [0.000000] [java/INFO] MSG_main finished; Cleaning up the simulation... +> [0.000000] [java/INFO] MSG_main finished; Terminating the simulation... 
diff --git a/examples/java/process/kill/process-kill.tesh b/examples/java/process/kill/process-kill.tesh index 15becffb1a..aa210f3f06 100644 --- a/examples/java/process/kill/process-kill.tesh +++ b/examples/java/process/kill/process-kill.tesh @@ -9,4 +9,4 @@ $ java -classpath ${classpath:=.} process/kill/Main ${srcdir:=.}/../platforms/sm > [Boivin:victim:(2) 10.000000] [java/INFO] OK, OK. Let's work > [Jacquelin:killer:(1) 11.000000] [java/INFO] Kill Process > [Jacquelin:killer:(1) 11.000000] [java/INFO] Ok, goodbye now. -> [11.000000] [java/INFO] MSG_main finished; Cleaning up the simulation... +> [11.000000] [java/INFO] MSG_main finished; Terminating the simulation... diff --git a/examples/java/process/migration/process-migration.tesh b/examples/java/process/migration/process-migration.tesh index 75cb358fd5..b2a24e4d07 100644 --- a/examples/java/process/migration/process-migration.tesh +++ b/examples/java/process/migration/process-migration.tesh @@ -8,7 +8,7 @@ $ java -classpath ${classpath:=.} process/migration/Main ${srcdir:=.}/../platfor > [ 0.000000] (2:emigrant@Boivin) Yeah, found something to do > [ 1.000000] (1:policeman@Boivin) Wait a bit before migrating the emigrant. > [ 3.000000] (2:emigrant@Boivin) Moving back to home after work -> [ 7.000000] (0:maestro@) MSG_main finished; Cleaning up the simulation... +> [ 7.000000] (0:maestro@) MSG_main finished; Terminating the simulation... > [ 7.000000] (2:emigrant@Jacquelin) I've been moved on this new host:Jacquelin > [ 7.000000] (2:emigrant@Jacquelin) Uh, nothing to do here. 
Stopping now > [ 7.000000] (1:policeman@Boivin) I moved the emigrant diff --git a/examples/java/process/startkilltime/process-startkilltime.tesh b/examples/java/process/startkilltime/process-startkilltime.tesh index 7e5dbd9b4e..d3a5f486ff 100644 --- a/examples/java/process/startkilltime/process-startkilltime.tesh +++ b/examples/java/process/startkilltime/process-startkilltime.tesh @@ -9,4 +9,4 @@ $ java -classpath ${classpath:=.} process/startkilltime/Main ${srcdir:=.}/../pla > [node-5.acme.org:process.startkilltime.Sleeper:(6) 5.000000] [java/INFO] Hello! I go to sleep. > [node-2.acme.org:process.startkilltime.Sleeper:(3) 6.000000] [java/INFO] Done sleeping > [node-3.acme.org:process.startkilltime.Sleeper:(4) 7.000000] [java/INFO] Done sleeping -> [10.000000] [java/INFO] MSG_main finished; Cleaning up the simulation... +> [10.000000] [java/INFO] MSG_main finished; Terminating the simulation... diff --git a/examples/java/process/suspend/process-suspend.tesh b/examples/java/process/suspend/process-suspend.tesh index 7c771a2f3c..45e219bd81 100644 --- a/examples/java/process/suspend/process-suspend.tesh +++ b/examples/java/process/suspend/process-suspend.tesh @@ -6,7 +6,7 @@ $ java -classpath ${classpath:=.} process/suspend/Main ${srcdir:=.}/../platforms > [ 0.000000] (1:DreamMaster@Jacquelin) Let's create a lazy guy. > [ 0.000000] (1:DreamMaster@Jacquelin) Let's wait a little bit... > [ 0.000000] (2:Lazy@Jacquelin) Nobody's watching me ? Let's go to sleep. -> [ 10.000000] (0:maestro@) MSG_main finished; Cleaning up the simulation... +> [ 10.000000] (0:maestro@) MSG_main finished; Terminating the simulation... > [ 10.000000] (1:DreamMaster@Jacquelin) Let's wake the lazy guy up! >:) BOOOOOUUUHHH!!!! > [ 10.000000] (1:DreamMaster@Jacquelin) OK, goodbye now. > [ 10.000000] (2:Lazy@Jacquelin) Uuuh ? Did somebody call me ? 
diff --git a/examples/java/task/priority/task-priority.tesh b/examples/java/task/priority/task-priority.tesh index f88e895c31..195fe6aa0c 100644 --- a/examples/java/task/priority/task-priority.tesh +++ b/examples/java/task/priority/task-priority.tesh @@ -7,5 +7,5 @@ $ java -classpath ${classpath:=.} task/priority/Main ${srcdir:=.}/../platforms/s > [ 0.000000] (1:task.priority.Test@Fafard) Hello! Running a task of size 7.6296E7 with priority 1.0 > [ 0.000000] (2:task.priority.Test@Fafard) Hello! Running a task of size 7.6296E7 with priority 2.0 > [ 1.500000] (2:task.priority.Test@Fafard) Goodbye now! -> [ 2.000000] (0:maestro@) MSG_main finished; Cleaning up the simulation... +> [ 2.000000] (0:maestro@) MSG_main finished; Terminating the simulation... > [ 2.000000] (1:task.priority.Test@Fafard) Goodbye now! diff --git a/examples/java/trace/pingpong/trace-pingpong.tesh b/examples/java/trace/pingpong/trace-pingpong.tesh index 84b951b72a..7e1fee9dc0 100644 --- a/examples/java/trace/pingpong/trace-pingpong.tesh +++ b/examples/java/trace/pingpong/trace-pingpong.tesh @@ -7,7 +7,7 @@ $ java -classpath ${classpath:=.} trace/pingpong/Main ${srcdir:=.}/../platforms/ > [0.000000] [xbt_cfg/INFO] Configuration change: Set 'tracing' to 'yes' > [0.000000] [xbt_cfg/INFO] Configuration change: Set 'tracing/filename' to 'simulation.trace' > [0.000000] [xbt_cfg/INFO] Configuration change: Set 'tracing/platform' to 'yes' -> [3.817809] [java/INFO] MSG_main finished; Cleaning up the simulation... +> [3.817809] [java/INFO] MSG_main finished; Terminating the simulation... > [Boivin:Receiver:(2) 0.000000] [java/INFO] hello! 
> [Boivin:Receiver:(2) 0.000000] [java/INFO] try to get a task > [Boivin:Receiver:(2) 1.048882] [java/INFO] Got at time 1.0488818628325232 diff --git a/src/bindings/java/JavaContext.cpp b/src/bindings/java/JavaContext.cpp index 7796bda171..5fe1c0cfab 100644 --- a/src/bindings/java/JavaContext.cpp +++ b/src/bindings/java/JavaContext.cpp @@ -29,6 +29,7 @@ ContextFactory* java_factory() JavaContextFactory::JavaContextFactory(): ContextFactory("JavaContextFactory") { + xbt_binary_name = xbt_strdup("java"); // Used by the backtrace displayer } JavaContextFactory::~JavaContextFactory()=default; @@ -46,53 +47,24 @@ JavaContext* JavaContextFactory::create_context(std::function code, void void JavaContextFactory::run_all() { - for (smx_actor_t const& process : simgrid::simix::process_get_runnable()) { - static_cast(process->context_)->resume(); - } + SerialThreadContext::run_all(); } -JavaContext::JavaContext(std::function code, - void_pfn_smxprocess_t cleanup_func, - smx_actor_t process) - : Context(std::move(code), cleanup_func, process) +JavaContext::JavaContext(std::function code, void_pfn_smxprocess_t cleanup_func, smx_actor_t process) + : SerialThreadContext(std::move(code), cleanup_func, process, false /* not maestro */) { - /* If the user provided a function for the process then use it. 
Otherwise is the context for maestro */ - if (has_code()) { - this->begin_ = xbt_os_sem_init(0); - this->end_ = xbt_os_sem_init(0); - - this->thread_ = xbt_os_thread_create(nullptr, JavaContext::wrapper, this, nullptr); - } else { - xbt_os_thread_set_extra_data(this); - } + /* ThreadContext already does all we need */ } -JavaContext::~JavaContext() +void JavaContext::start_hook() { - if (this->thread_) { - // We are not in maestro context - xbt_os_thread_join(this->thread_, nullptr); - xbt_os_sem_destroy(this->begin_); - xbt_os_sem_destroy(this->end_); - } -} + xbt_os_thread_set_extra_data(this); // We need to attach it also for maestro, contrary to our ancestor -void* JavaContext::wrapper(void *data) -{ - JavaContext* context = static_cast(data); - xbt_os_thread_set_extra_data(context); //Attach the thread to the JVM - JNIEnv *env; XBT_ATTRIB_UNUSED jint error = __java_vm->AttachCurrentThread((void**)&env, nullptr); xbt_assert((error == JNI_OK), "The thread could not be attached to the JVM"); - context->jenv_ = env; - //Wait for the first scheduling round to happen. - xbt_os_sem_acquire(context->begin_); - //Create the "Process" object if needed. - (*context)(); - context->stop(); - return nullptr; + this->jenv_ = env; } void JavaContext::stop() @@ -100,7 +72,7 @@ void JavaContext::stop() /* I was asked to die (either with kill() or because of a failed element) */ if (this->iwannadie) { this->iwannadie = 0; - JNIEnv *env = get_current_thread_env(); + JNIEnv* env = this->jenv_; XBT_DEBUG("Gonna launch Killed Error"); // When the process wants to stop before its regular end, we should cut its call stack quickly. // The easiest way to do so is to raise an exception that will be catched in its top calling level. @@ -133,32 +105,16 @@ void JavaContext::stop() // In other words, we need to do in C++ what we do in Java for sake of uniformity. // // Plus, C++ RAII would work in that case, too. 
- XBT_DEBUG("Trigger a cancel error at the C level"); THROWF(cancel_error, 0, "process cancelled"); } else { - Context::stop(); - /* detach the thread and kills it */ + ThreadContext::stop(); JNIEnv* env = this->jenv_; env->DeleteGlobalRef(this->jprocess_); XBT_ATTRIB_UNUSED jint error = __java_vm->DetachCurrentThread(); xbt_assert((error == JNI_OK), "The thread couldn't be detached."); - xbt_os_sem_release(this->end_); xbt_os_thread_exit(nullptr); } } -void JavaContext::suspend() -{ - xbt_os_sem_release(this->end_); - xbt_os_sem_acquire(this->begin_); -} - -// FIXME: inline those functions -void JavaContext::resume() -{ - xbt_os_sem_release(this->begin_); - xbt_os_sem_acquire(this->end_); -} - }}} // namespace simgrid::kernel::context diff --git a/src/bindings/java/JavaContext.hpp b/src/bindings/java/JavaContext.hpp index 2724033d4b..0a07f01358 100644 --- a/src/bindings/java/JavaContext.hpp +++ b/src/bindings/java/JavaContext.hpp @@ -12,6 +12,7 @@ #include #include "simgrid/simix.h" +#include "src/kernel/context/ContextThread.hpp" #include "src/simix/smx_private.hpp" #include "xbt/xbt_os_thread.h" @@ -24,28 +25,20 @@ namespace context { class JavaContext; class JavacontextFactory; -class JavaContext : public simgrid::kernel::context::Context { +class JavaContext : public simgrid::kernel::context::SerialThreadContext { public: // The java process instance bound with the msg process structure: jobject jprocess_ = nullptr; // JNI interface pointer associated to this thread: JNIEnv* jenv_ = nullptr; - xbt_os_thread_t thread_ = nullptr; - // Sempahore used to schedule/yield to the process: - xbt_os_sem_t begin_ = nullptr; - // Semaphore used to schedule/unschedule the process: - xbt_os_sem_t end_ = nullptr; friend class JavaContextFactory; JavaContext(std::function code, void_pfn_smxprocess_t cleanup_func, smx_actor_t process); - ~JavaContext() override; + + void start_hook() override; void stop() override; - void suspend() override; - void resume(); -private: - static 
void* wrapper(void *data); }; class JavaContextFactory : public simgrid::kernel::context::ContextFactory { diff --git a/src/bindings/java/jmsg.cpp b/src/bindings/java/jmsg.cpp index 053f6ce802..143a28bfa0 100644 --- a/src/bindings/java/jmsg.cpp +++ b/src/bindings/java/jmsg.cpp @@ -132,7 +132,7 @@ JNIEXPORT void JNICALL JNICALL Java_org_simgrid_msg_Msg_run(JNIEnv * env, jclass jxbt_check_res("MSG_main()", rv, MSG_OK, xbt_strdup("unexpected error : MSG_main() failed .. please report this bug ")); - XBT_INFO("MSG_main finished; Cleaning up the simulation..."); + XBT_INFO("MSG_main finished; Terminating the simulation..."); /* Cleanup java hosts */ xbt_dynar_t hosts = MSG_hosts_as_dynar(); for (unsigned long index = 0; index < xbt_dynar_length(hosts) - 1; index++) { @@ -140,13 +140,16 @@ JNIEXPORT void JNICALL JNICALL Java_org_simgrid_msg_Msg_run(JNIEnv * env, jclass jobject jhost = (jobject) msg_host->extension(JAVA_HOST_LEVEL); if (jhost) jhost_unref(env, jhost); - } xbt_dynar_free(&hosts); /* Cleanup java storages */ for (auto const& elm : java_storage_map) jstorage_unref(env, elm.second); + + /* FIXME: don't use such EXTREME BRUTALITY to stop the JVM. Sorry, I could not get it working otherwise. + * See the comment in ActorImpl.cpp::SIMIX_process_kill() */ + exit(0); } JNIEXPORT void JNICALL Java_org_simgrid_msg_Msg_createEnvironment(JNIEnv * env, jclass cls, jstring jplatformFile) @@ -250,9 +253,11 @@ static void run_jprocess(JNIEnv *env, jobject jprocess) jdouble startTime = env->GetDoubleField(jprocess, jprocess_field_Process_startTime); if (startTime > MSG_get_clock()) MSG_process_sleep(startTime - MSG_get_clock()); + //Execution of the "run" method. 
jmethodID id = jxbt_get_smethod(env, "org/simgrid/msg/Process", "run", "()V"); - xbt_assert((id != nullptr), "Method run() not found..."); + xbt_assert((id != nullptr), "Method Process.run() not found..."); + env->CallVoidMethod(jprocess, id); } diff --git a/src/bindings/java/jmsg_process.cpp b/src/bindings/java/jmsg_process.cpp index 758b3814e4..7580858ef9 100644 --- a/src/bindings/java/jmsg_process.cpp +++ b/src/bindings/java/jmsg_process.cpp @@ -226,7 +226,11 @@ JNIEXPORT void JNICALL Java_org_simgrid_msg_Process_sleep(JNIEnv *env, jclass cl { double time = ((double)jmillis) / 1000 + ((double)jnanos) / 1000000000; msg_error_t rv; - rv = MSG_process_sleep(time); + try { + rv = MSG_process_sleep(time); + } catch (simgrid::kernel::context::Context::StopRequest const&) { + rv = MSG_HOST_FAILURE; + } if (rv != MSG_OK) { XBT_DEBUG("Something during the MSG_process_sleep invocation was wrong, trigger a HostFailureException"); diff --git a/src/bindings/java/org/simgrid/NativeLib.java b/src/bindings/java/org/simgrid/NativeLib.java index da9801dcdd..7fb11031d5 100644 --- a/src/bindings/java/org/simgrid/NativeLib.java +++ b/src/bindings/java/org/simgrid/NativeLib.java @@ -79,7 +79,7 @@ public final class NativeLib { System.err.println(); cause.printStackTrace(); } else { - System.err.println("This jar file does not seem to fit your system, and no usable SimGrid installation found on disk."); + System.err.println("This jar file does not seem to fit your system, and no usable SimGrid installation found on disk for "+name+"."); } System.exit(1); } diff --git a/src/kernel/context/ContextThread.hpp b/src/kernel/context/ContextThread.hpp index a0b958e8b4..48a9d1aa11 100644 --- a/src/kernel/context/ContextThread.hpp +++ b/src/kernel/context/ContextThread.hpp @@ -40,8 +40,8 @@ private: void start(); // match a call to release() void yield(); // match a call to yield() - virtual void start_hook() { /* empty placeholder, called after start() */} - virtual void yield_hook() { /* 
empty placeholder, called after start(). Used in parallel mode and Java */} + virtual void yield_hook() { /* empty placeholder, called before yield(). Used in parallel mode */} static void* wrapper(void *param); }; diff --git a/src/simix/ActorImpl.cpp b/src/simix/ActorImpl.cpp index a91c4c8274..5842df4dff 100644 --- a/src/simix/ActorImpl.cpp +++ b/src/simix/ActorImpl.cpp @@ -500,8 +500,30 @@ void SIMIX_process_kill(smx_actor_t process, smx_actor_t issuer) { process->exception = nullptr; // Forcefully kill the actor if its host is turned off. Not an HostFailureException because you should not survive that - if (process->host_->is_off()) - process->throw_exception(std::make_exception_ptr(simgrid::kernel::context::Context::StopRequest("Host failed"))); + if (process->host_->is_off()) { + /* HORRIBLE HACK: Don't throw a StopRequest exception in Java, because it breaks sometimes. + * + * It seems to break for the actors started from the Java world, with new Process() + * while it works for the ones started from the C world, with the deployment file. + * When it happens, the simulation stops brutally with a message "untrapped exception StopRequest". + * + * From what I understand, it works for the native actors because they have a nice try/catch block around their main + * but I fail to have something like that for pure Java actors. That's probably a story of C->Java vs Java->C + * calling conventions. The right solution may be to have try/catch(StopRequest) blocks around each native call in + * JNI. i.e., protect every Java->C++ call from C++ exceptions. But this sounds long and painful to do before we + * switch to an automatic generator such as SWIG. For now, we don't throw here that exception that we sometimes fail + * to catch. + * + * One of the unfortunate outcomes is that the threads started from the deployment file are not stopped anymore. 
+ * Or maybe these are the actors stopping gracefully as opposed to the killed ones? Or maybe this is absolutely all + * actors of the Java simulation? I'm not sure. Anyway. Because of them, the simulation hangs at the end, waiting + * for them to stop but they won't. The current answer to that is very brutal: + * we do an "exit(0)" to kill the JVM from the C code after the call to MSG_run(). Definitely unpleasant. + */ + + if (simgrid::kernel::context::factory_initializer == nullptr) // Only Java sets a factory_initializer, for now + process->throw_exception(std::make_exception_ptr(simgrid::kernel::context::Context::StopRequest("Host failed"))); + } /* destroy the blocking synchro if any */ if (process->waiting_synchro != nullptr) { @@ -760,6 +782,7 @@ void SIMIX_process_yield(smx_actor_t self) XBT_DEBUG("Process %s@%s is dead", self->get_cname(), self->host_->get_cname()); self->context_->stop(); + xbt_backtrace_display_current(); xbt_die("I should be dead by now."); } diff --git a/src/simix/smx_context.cpp b/src/simix/smx_context.cpp index d07d747b97..6f32bb18b6 100644 --- a/src/simix/smx_context.cpp +++ b/src/simix/smx_context.cpp @@ -101,7 +101,7 @@ void SIMIX_context_mod_init() #endif /* select the context factory to use to create the contexts */ - if (simgrid::kernel::context::factory_initializer) { // Give Java a chance to hijack the factory mechanism + if (simgrid::kernel::context::factory_initializer != nullptr) { // Give Java a chance to hijack the factory mechanism simix_global->context_factory = simgrid::kernel::context::factory_initializer(); return; } diff --git a/teshsuite/java/semaphoregc/semaphoregc.tesh b/teshsuite/java/semaphoregc/semaphoregc.tesh index a321b134ad..8cbe59073a 100644 --- a/teshsuite/java/semaphoregc/semaphoregc.tesh +++ b/teshsuite/java/semaphoregc/semaphoregc.tesh @@ -4,4 +4,4 @@ $ java -classpath ${classpath:=.} semaphoregc.SemaphoreGC ${srcdir:=.}/small_pla > [ 0.0000] (0:maestro@) Using regular java threads.
> [ 0.0000] (1:SemCreator@Fafard) Creating 50 new Semaphores, yielding and triggering a GC after each > [ 500.0000] (1:SemCreator@Fafard) It worked, we survived. The test is passed. -> [ 500.0000] (0:maestro@) MSG_main finished; Cleaning up the simulation... +> [ 500.0000] (0:maestro@) MSG_main finished; Terminating the simulation... diff --git a/teshsuite/java/sleephostoff/sleephostoff.tesh b/teshsuite/java/sleephostoff/sleephostoff.tesh index 55aabb59da..c10a19d547 100644 --- a/teshsuite/java/sleephostoff/sleephostoff.tesh +++ b/teshsuite/java/sleephostoff/sleephostoff.tesh @@ -9,4 +9,4 @@ $ java -classpath ${classpath:=.} sleephostoff.SleepHostOff ${srcdir:=.}/small_p > [ 0.020000] (2:Sleeper@Tremblay) catch HostException: Host Failure > [ 0.020000] (1:TestRunner@Fafard) Tremblay has been stopped > [ 0.320000] (1:TestRunner@Fafard) Test sleep seems ok, cool! (number of Process : 1, it should be 1 (i.e. the Test one)) -> [ 0.320000] (0:maestro@) MSG_main finished; Cleaning up the simulation... +> [ 0.320000] (0:maestro@) MSG_main finished; Terminating the simulation...